From bd7423f4f0a02c226951760268b131043d93ee95 Mon Sep 17 00:00:00 2001
From: Brad Larsen <bradford.larsen@praetorian.com>
Date: Wed, 6 Mar 2024 16:32:12 -0500
Subject: [PATCH 01/56] Add CMake options for more build granularity

This adds three new CMake options, all defaulting to true, making it
possible to opt out of building parts of Vectorscan that are not
essential for deployment of the matching runtime.

These new options:

- `BUILD_UNIT`: control whether the `unit` directory is included
- `BUILD_DOC`: control whether the `doc` directory is included
- `BUILD_TOOLS`: control whether the `tools` directory is included
---
 CMakeLists.txt | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7e07a9a..c6952f41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1221,11 +1221,17 @@ if (NOT BUILD_STATIC_LIBS)
 endif ()
 
 add_subdirectory(util)
-add_subdirectory(unit)
 
-if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
+option(BUILD_UNIT "Build Hyperscan unit tests (default TRUE)" TRUE)
+if(BUILD_UNIT)
+    add_subdirectory(unit)
+endif()
+
+option(BUILD_TOOLS "Build Hyperscan tools (default TRUE)" TRUE)
+if(EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt AND BUILD_TOOLS)
     add_subdirectory(tools)
 endif()
+
 if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA)
     add_subdirectory(chimera)
 endif()
@@ -1240,4 +1246,7 @@ if(BUILD_BENCHMARKS)
     add_subdirectory(benchmarks)
 endif()
 
-add_subdirectory(doc/dev-reference)
+option(BUILD_DOC "Build the Hyperscan documentation (default TRUE)" TRUE)
+if(BUILD_DOC)
+    add_subdirectory(doc/dev-reference)
+endif()

From d9a75dc3b96b4e1bf08253dd95f81663ba49acde Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Thu, 15 Feb 2024 14:39:42 -0600
Subject: [PATCH 02/56] documentation: Add cmake option to build man pages

Man pages tend to be preferred in some circles, so let's add an
option to build the Vectorscan documentation that way.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 doc/dev-reference/CMakeLists.txt | 11 +++++++++++
 doc/dev-reference/conf.py.in     |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/doc/dev-reference/CMakeLists.txt b/doc/dev-reference/CMakeLists.txt
index 449589f6..6f48e2e4 100644
--- a/doc/dev-reference/CMakeLists.txt
+++ b/doc/dev-reference/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
 set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
 set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
 set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+set(SPHINX_MAN_DIR "${CMAKE_CURRENT_BINARY_DIR}/man")
 
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
     "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
@@ -32,4 +33,14 @@ add_custom_target(dev-reference
         "${SPHINX_HTML_DIR}"
     DEPENDS dev-reference-doxygen
     COMMENT "Building HTML dev reference with Sphinx")
+
+add_custom_target(dev-reference-man
+    ${SPHINX_BUILD}
+        -b man
+        -c "${CMAKE_CURRENT_BINARY_DIR}"
+        -d "${SPHINX_CACHE_DIR}"
+        "${CMAKE_CURRENT_SOURCE_DIR}"
+        "${SPHINX_MAN_DIR}"
+    DEPENDS dev-reference-doxygen
+    COMMENT "Building man page reference with Sphinx")
 endif()
diff --git a/doc/dev-reference/conf.py.in b/doc/dev-reference/conf.py.in
index d0ef371b..ad97f088 100644
--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@@ -233,7 +233,7 @@ latex_documents = [
 # (source start file, name, description, authors, manual section).
 man_pages = [
     ('index', 'hyperscan', u'Hyperscan Documentation',
-     [u'Intel Corporation'], 1)
+     [u'Intel Corporation'], 7)
 ]
 
 # If true, show URL addresses after external links.

From 2d23d24b678f39c92a9bc8b41af241b3701b73f1 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Thu, 15 Feb 2024 14:51:11 -0600
Subject: [PATCH 03/56] documentation: Update project name and copyright

The project name in the documentation should probably
be updated to reflect that this is Vectorscan. Update
the copyright too.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 doc/dev-reference/conf.py.in | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/dev-reference/conf.py.in b/doc/dev-reference/conf.py.in
index ad97f088..298a54b1 100644
--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Hyperscan documentation build configuration file, created by
+# Vectorscan documentation build configuration file, created by
 # sphinx-quickstart on Tue Sep 29 15:59:19 2015.
 #
 # This file is execfile()d with the current directory set to its
@@ -43,8 +43,8 @@ source_suffix = '.rst'
 master_doc = 'index'
 
 # General information about the project.
-project = u'Hyperscan'
-copyright = u'2015-2018, Intel Corporation'
+project = u'Vectorscan'
+copyright = u'2015-2020, Intel Corporation; 2020-2024, VectorCamp; and other contributors'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -202,7 +202,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  ('index', 'Hyperscan.tex', u'Hyperscan Documentation',
+  ('index', 'Hyperscan.tex', u'Vectorscan Documentation',
    u'Intel Corporation', 'manual'),
 ]
 
@@ -232,7 +232,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'hyperscan', u'Hyperscan Documentation',
+    ('index', 'vectorscan', u'Vectorscan Documentation',
      [u'Intel Corporation'], 7)
 ]
 
@@ -246,8 +246,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'Hyperscan', u'Hyperscan Documentation',
-   u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
+  ('index', 'Vectorscan', u'Vectorscan Documentation',
+   u'Intel Corporation; VectorCamp', 'Vectorscan', 'High-performance regular expression matcher.',
    'Miscellaneous'),
 ]
 

From 943f198ebf641c7511b12b8b3bb7ead8a6681228 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Thu, 15 Feb 2024 15:13:20 -0600
Subject: [PATCH 04/56] documentation: Replace project name with Vectorscan and
 general updates

The generated documentation continues to refer to Hyperscan
despite the project now being Vectorscan. Let's replace many
of the Hyperscan references with Vectorscan.

At the same time, let's resync the documentation here with the
Vectorscan readme. This updates the supported platforms/compilers
and build options.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 doc/dev-reference/chimera.rst         |  22 ++--
 doc/dev-reference/compilation.rst     |  92 +++++++--------
 doc/dev-reference/getting_started.rst | 159 +++++++++++++++++---------
 doc/dev-reference/index.rst           |   2 +-
 doc/dev-reference/intro.rst           |  22 ++--
 doc/dev-reference/performance.rst     |  22 ++--
 doc/dev-reference/preface.rst         |  18 +--
 doc/dev-reference/runtime.rst         |  24 ++--
 doc/dev-reference/serialization.rst   |  20 ++--
 doc/dev-reference/tools.rst           |  44 +++----
 10 files changed, 239 insertions(+), 186 deletions(-)

diff --git a/doc/dev-reference/chimera.rst b/doc/dev-reference/chimera.rst
index d35b116f..cb8c84c4 100644
--- a/doc/dev-reference/chimera.rst
+++ b/doc/dev-reference/chimera.rst
@@ -11,10 +11,10 @@ Introduction
 ************
 
 Chimera is a software regular expression matching engine that is a hybrid of
-Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE
-syntax as well as to take advantage of the high performance nature of Hyperscan.
+Vectorscan and PCRE. The design goals of Chimera are to fully support PCRE
+syntax as well as to take advantage of the high performance nature of Vectorscan.
 
-Chimera inherits the design guideline of Hyperscan with C APIs for compilation
+Chimera inherits the design guideline of Vectorscan with C APIs for compilation
 and scanning.
 
 The Chimera API itself is composed of two major components:
@@ -65,13 +65,13 @@ For a given database, Chimera provides several guarantees:
 .. note:: Chimera is designed to have the same matching behavior as PCRE,
    including greedy/ungreedy, capturing, etc. Chimera reports both
    **start offset** and **end offset** for each match like PCRE. Different
-   from the fashion of reporting all matches in Hyperscan, Chimera only reports
+   from the fashion of reporting all matches in Vectorscan, Chimera only reports
    non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
    match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
 
-.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
+.. note:: Since Chimera is a hybrid of Vectorscan and PCRE in order to support
    full PCRE syntax, there will be extra performance overhead compared to
-   Hyperscan-only solution. Please always use Hyperscan for better performance
+   Vectorscan-only solution. Please always use Vectorscan for better performance
    unless you must need full PCRE syntax support.
 
 See :ref:`chruntime` for more details
@@ -83,12 +83,12 @@ Requirements
 The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
 
 .. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
-   directory under Hyperscan root directory in order to build Chimera.
+   directory under Vectorscan root directory in order to build Chimera.
 
-Beside this, both hardware and software requirements of Chimera are the same to Hyperscan.
+Beside this, both hardware and software requirements of Chimera are the same to Vectorscan.
 See :ref:`hardware` and :ref:`software` for more details.
 
-.. note:: Building Hyperscan will automatically generate Chimera library.
+.. note:: Building Vectorscan will automatically generate Chimera library.
    Currently only static library is supported for Chimera, so please
    use static build type when configure CMake build options.
 
@@ -119,7 +119,7 @@ databases:
 
 Compilation allows the Chimera library to analyze the given pattern(s) and
 pre-determine how to scan for these patterns in an optimized fashion using
-Hyperscan and PCRE.
+Vectorscan and PCRE.
 
 ===============
 Pattern Support
@@ -134,7 +134,7 @@ Semantics
 =========
 
 Chimera supports the exact same semantics of PCRE library. Moreover, it supports
-multiple simultaneous pattern matching like Hyperscan and the multiple matches
+multiple simultaneous pattern matching like Vectorscan and the multiple matches
 will be reported in order by end offset.
 
 .. _chruntime:
diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst
index 6f5541ec..a0ae8c8b 100644
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@@ -9,7 +9,7 @@ Compiling Patterns
 Building a Database
 *******************
 
-The Hyperscan compiler API accepts regular expressions and converts them into a
+The Vectorscan compiler API accepts regular expressions and converts them into a
 compiled pattern database that can then be used to scan data.
 
 The API provides three functions that compile regular expressions into
@@ -24,7 +24,7 @@ databases:
 #. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
    but allows :ref:`extparam` to be specified for each expression.
 
-Compilation allows the Hyperscan library to analyze the given pattern(s) and
+Compilation allows the Vectorscan library to analyze the given pattern(s) and
 pre-determine how to scan for these patterns in an optimized fashion that would
 be far too expensive to compute at run-time.
 
@@ -48,10 +48,10 @@ To compile patterns to be used in streaming mode, the ``mode`` parameter of
 block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
 requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
 for one mode (streaming, block or vectored) can only be used in that mode. The
-version of Hyperscan used to produce a compiled pattern database must match the
-version of Hyperscan used to scan with it.
+version of Vectorscan used to produce a compiled pattern database must match the
+version of Vectorscan used to scan with it.
 
-Hyperscan provides support for targeting a database at a particular CPU
+Vectorscan provides support for targeting a database at a particular CPU
 platform; see :ref:`instr_specialization` for details.
 
 =====================
@@ -75,14 +75,14 @@ characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
 While in pure literal case, all these meta characters lost extra meanings
 expect for that they are just common ASCII codes.
 
-Hyperscan is initially designed to process common regular expressions. It is
+Vectorscan is initially designed to process common regular expressions. It is
 hence embedded with a complex parser to do comprehensive regular grammar
 interpretation. Particularly, the identification of above meta characters is the
 basic step for the interpretation of far more complex regular grammars.
 
 However in real cases, patterns may not always be regular expressions. They
 could just be pure literals. Problem will come if the pure literals contain
-regular meta characters. Supposing fed directly into traditional Hyperscan
+regular meta characters. Supposing fed directly into traditional Vectorscan
 compile API, all these meta characters will be interpreted in predefined ways,
 which is unnecessary and the result is totally out of expectation. To avoid
 such misunderstanding by traditional API, users have to preprocess these
@@ -90,7 +90,7 @@ literal patterns by converting the meta characters into some other formats:
 either by adding a backslash ``\`` before certain meta characters, or by
 converting all the characters into a hexadecimal representation.
 
-In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
+In ``v5.2.0``, Vectorscan introduces 2 new compile APIs for pure literal patterns:
 
 #. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
    database.
@@ -106,7 +106,7 @@ content directly into these APIs without worrying about writing regular meta
 characters in their patterns. No preprocessing work is needed any more.
 
 For new APIs, the ``length`` of each literal pattern is a newly added parameter.
-Hyperscan needs to locate the end position of the input expression via clearly
+Vectorscan needs to locate the end position of the input expression via clearly
 knowing each literal's length, not by simply identifying character ``\0`` of a
 string.
 
@@ -127,19 +127,19 @@ Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`,
 Pattern Support
 ***************
 
-Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
+Vectorscan supports the pattern syntax used by the PCRE library ("libpcre"),
 described at <http://www.pcre.org/>. However, not all constructs available in
 libpcre are supported. The use of unsupported constructs will result in
 compilation errors.
 
-The version of PCRE used to validate Hyperscan's interpretation of this syntax
+The version of PCRE used to validate Vectorscan's interpretation of this syntax
 is 8.41 or above.
 
 ====================
 Supported Constructs
 ====================
 
-The following regex constructs are supported by Hyperscan:
+The following regex constructs are supported by Vectorscan:
 
 * Literal characters and strings, with all libpcre quoting and character
   escapes.
@@ -177,7 +177,7 @@ The following regex constructs are supported by Hyperscan:
       :c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
 
   * Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
-    :regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
+    :regexp:`\\w+?`) are supported but ignored (as Vectorscan reports all
     matches).
 
 * Parenthesization, including the named and unnamed capturing and
@@ -219,15 +219,15 @@ The following regex constructs are supported by Hyperscan:
 .. note:: At this time, not all patterns can be successfully compiled with the
   :c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
   :ref:`som`. The patterns that support this flag are a subset of patterns that
-  can be successfully compiled with Hyperscan; notably, many bounded repeat
-  forms that can be compiled with Hyperscan without the Start of Match flag
+  can be successfully compiled with Vectorscan; notably, many bounded repeat
+  forms that can be compiled with Vectorscan without the Start of Match flag
   enabled cannot be compiled with the flag enabled.
 
 ======================
 Unsupported Constructs
 ======================
 
-The following regex constructs are not supported by Hyperscan:
+The following regex constructs are not supported by Vectorscan:
 
 * Backreferences and capturing sub-expressions.
 * Arbitrary zero-width assertions.
@@ -246,32 +246,32 @@ The following regex constructs are not supported by Hyperscan:
 Semantics
 *********
 
-While Hyperscan follows libpcre syntax, it provides different semantics. The
+While Vectorscan follows libpcre syntax, it provides different semantics. The
 major departures from libpcre semantics are motivated by the requirements of
 streaming and multiple simultaneous pattern matching.
 
 The major departures from libpcre semantics are:
 
-#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
+#. **Multiple pattern matching**: Vectorscan allows matches to be reported for
    several patterns simultaneously. This is not equivalent to separating the
    patterns by :regexp:`|` in libpcre, which evaluates alternations
    left-to-right.
 
-#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
+#. **Lack of ordering**: the multiple matches that Vectorscan produces are not
    guaranteed to be ordered, although they will always fall within the bounds of
    the current scan.
 
-#. **End offsets only**: Hyperscan's default behaviour is only to report the end
+#. **End offsets only**: Vectorscan's default behaviour is only to report the end
    offset of a match. Reporting of the start offset can be enabled with
    per-expression flags at pattern compile time. See :ref:`som` for details.
 
 #. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
-   ``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
+   ``fooxyzbarbar`` will return two matches from Vectorscan -- at the points
    corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
    libpcre semantics by default would report only one match at ``fooxyzbarbar``
    (greedy semantics) or, if non-greedy semantics were switched on, one match at
    ``fooxyzbar``. This means that switching between greedy and non-greedy
-   semantics is a no-op in Hyperscan.
+   semantics is a no-op in Vectorscan.
 
 To support libpcre quantifier semantics while accurately reporting streaming
 matches at the time they occur is impossible. For example, consider the pattern
@@ -299,7 +299,7 @@ as in block 3 -- which would constitute a better match for the pattern.
 Start of Match
 ==============
 
-In standard operation, Hyperscan will only provide the end offset of a match
+In standard operation, Vectorscan will only provide the end offset of a match
 when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
 is specified for a particular pattern, then the same set of matches is
 returned, but each match will also provide the leftmost possible start offset
@@ -308,7 +308,7 @@ corresponding to its end offset.
 Using the SOM flag entails a number of trade-offs and limitations:
 
 * Reduced pattern support: For many patterns, tracking SOM is complex and can
-  result in Hyperscan failing to compile a pattern with a "Pattern too
+  result in Vectorscan failing to compile a pattern with a "Pattern too
   large" error, even if the pattern is supported in normal operation.
 * Increased stream state: At scan time, state space is required to track
   potential SOM offsets, and this must be stored in persistent stream state in
@@ -316,20 +316,20 @@ Using the SOM flag entails a number of trade-offs and limitations:
   required to match a pattern.
 * Performance overhead: Similarly, there is generally a performance cost
   associated with tracking SOM.
-* Incompatible features: Some other Hyperscan pattern flags (such as
+* Incompatible features: Some other Vectorscan pattern flags (such as
   :c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
   used in combination with SOM. Specifying them together with
   :c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
 
 In streaming mode, the amount of precision delivered by SOM can be controlled
-with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
+with the SOM horizon flags. These instruct Vectorscan to deliver accurate SOM
 information within a certain distance of the end offset, and return a special
 start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
 small or medium SOM horizon will usually reduce the stream state required for a
 given database.
 
 .. note:: In streaming mode, the start offset returned for a match may refer to
-   a point in the stream *before* the current block being scanned. Hyperscan
+   a point in the stream *before* the current block being scanned. Vectorscan
    provides no facility for accessing earlier blocks; if the calling application
    needs to inspect historical data, then it must store it itself.
 
@@ -341,7 +341,7 @@ Extended Parameters
 
 In some circumstances, more control over the matching behaviour of a pattern is
 required than can be specified easily using regular expression syntax. For
-these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
+these scenarios, Vectorscan provides the :c:func:`hs_compile_ext_multi` function
 that allows a set of "extended parameters" to be set on a per-pattern basis.
 
 Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
@@ -383,18 +383,18 @@ section.
 Prefiltering Mode
 =================
 
-Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
-be used to implement a prefilter for a pattern than Hyperscan would not
+Vectorscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
+be used to implement a prefilter for a pattern than Vectorscan would not
 ordinarily support.
 
-This flag instructs Hyperscan to compile an "approximate" version of this
-pattern for use in a prefiltering application, even if Hyperscan does not
+This flag instructs Vectorscan to compile an "approximate" version of this
+pattern for use in a prefiltering application, even if Vectorscan does not
 support the pattern in normal operation.
 
 The set of matches returned when this flag is used is guaranteed to be a
 superset of the matches specified by the non-prefiltering expression.
 
-If the pattern contains pattern constructs not supported by Hyperscan (such as
+If the pattern contains pattern constructs not supported by Vectorscan (such as
 zero-width assertions, back-references or conditional references) these
 constructs will be replaced internally with broader constructs that may match
 more often.
@@ -404,7 +404,7 @@ back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
 approximated by having its back-reference replaced with its referent, forming
 :regexp:`/\\w+ again \\w+/`.
 
-Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
+Furthermore, in prefiltering mode Vectorscan may simplify a pattern that would
 otherwise return a "Pattern too large" error at compile time, or for performance
 reasons (subject to the matching guarantee above).
 
@@ -422,22 +422,22 @@ matches for the pattern.
 Instruction Set Specialization
 ******************************
 
-Hyperscan is able to make use of several modern instruction set features found
+Vectorscan is able to make use of several modern instruction set features found
 on x86 processors to provide improvements in scanning performance.
 
 Some of these features are selected when the library is built; for example,
-Hyperscan will use the native ``POPCNT`` instruction on processors where it is
+Vectorscan will use the native ``POPCNT`` instruction on processors where it is
 available and the library has been optimized for the host architecture.
 
-.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
+.. note:: By default, the Vectorscan runtime is built with the ``-march=native``
    compiler flag and (where possible) will make use of all instructions known by
    the host's C compiler.
 
-To use some instruction set features, however, Hyperscan must build a
+To use some instruction set features, however, Vectorscan must build a
 specialized database to support them. This means that the target platform must
 be specified at pattern compile time.
 
-The Hyperscan compiler API functions all accept an optional
+The Vectorscan compiler API functions all accept an optional
 :c:type:`hs_platform_info_t` argument, which describes the target platform
 for the database to be built. If this argument is NULL, the database will be
 targeted at the current host platform.
@@ -467,7 +467,7 @@ See :ref:`api_constants` for the full list of CPU tuning and feature flags.
 Approximate matching
 ********************
 
-Hyperscan provides an experimental approximate matching mode, which will match
+Vectorscan provides an experimental approximate matching mode, which will match
 patterns within a given edit distance. The exact matching behavior is defined as
 follows:
 
@@ -492,7 +492,7 @@ follows:
 
 Here are a few examples of approximate matching:
 
-* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
+* Pattern :regexp:`/foo/` can match ``foo`` when using regular Vectorscan
   matching behavior. With approximate matching within edit distance 2, the
   pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
   ``f``, and anything else that lies within edit distance 2 of matching corpora
@@ -513,7 +513,7 @@ matching support. Here they are, in a nutshell:
 * Reduced pattern support:
 
   * For many patterns, approximate matching is complex and can result in
-    Hyperscan failing to compile a pattern with a "Pattern too large" error,
+    Vectorscan failing to compile a pattern with a "Pattern too large" error,
     even if the pattern is supported in normal operation.
   * Additionally, some patterns cannot be approximately matched because they
     reduce to so-called "vacuous" patterns (patterns that match everything). For
@@ -548,7 +548,7 @@ Logical Combinations
 ********************
 
 For situations when a user requires behaviour that depends on the presence or
-absence of matches from groups of patterns, Hyperscan provides support for the
+absence of matches from groups of patterns, Vectorscan provides support for the
 logical combination of patterns in a given pattern set, with three operators:
 ``NOT``, ``AND`` and ``OR``.
 
@@ -561,7 +561,7 @@ offset is *true* if the expression it refers to is *false* at this offset.
 For example, ``NOT 101`` means that expression 101 has not yet matched at this
 offset.
 
-A logical combination is passed to Hyperscan at compile time as an expression.
+A logical combination is passed to Vectorscan at compile time as an expression.
 This combination expression will raise matches at every offset where one of its
 sub-expressions matches and the logical value of the whole expression is *true*.
 
@@ -603,7 +603,7 @@ In a logical combination expression:
  * Whitespace is ignored.
 
 To use a logical combination expression, it must be passed to one of the
-Hyperscan compile functions (:c:func:`hs_compile_multi`,
+Vectorscan compile functions (:c:func:`hs_compile_multi`,
 :c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
 which identifies the pattern as a logical combination expression. The patterns
 referred to in the logical combination expression must be compiled together in
@@ -613,7 +613,7 @@ When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, it ignores
 all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
 :c:member:`HS_FLAG_QUIET` flag.
 
-Hyperscan will accept logical combination expressions at compile time that
+Vectorscan will accept logical combination expressions at compile time that
 evaluate to *true* when no patterns have matched, and report the match for
 combination at end of data if no patterns have matched; for example: ::
 
diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst
index aaff15ba..57d78211 100644
--- a/doc/dev-reference/getting_started.rst
+++ b/doc/dev-reference/getting_started.rst
@@ -7,43 +7,41 @@ Getting Started
 Very Quick Start
 ****************
 
-#. Clone Hyperscan ::
+#. Clone Vectorscan ::
 
-     cd <where-you-want-hyperscan-source>
-     git clone git://github.com/intel/hyperscan
+     cd <where-you-want-vectorscan-source>
+     git clone https://github.com/VectorCamp/vectorscan
 
-#. Configure Hyperscan
+#. Configure Vectorscan
 
    Ensure that you have the correct :ref:`dependencies <software>` present,
    and then:
 
    ::
 
-     cd <where-you-want-to-build-hyperscan>
+     cd <where-you-want-to-build-vectorscan>
      mkdir <build-dir>
      cd <build-dir>
-     cmake [-G <generator>] [options] <hyperscan-source-path>
+     cmake [-G <generator>] [options] <vectorscan-source-path>
 
    Known working generators:
       * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
       * ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
-      * ``Visual Studio 15 2017`` --- Visual Studio projects
 
-   Generators that might work include:
+   Unsupported generators that might work include:
       * ``Xcode`` --- OS X Xcode projects.
 
-#. Build Hyperscan
+#. Build Vectorscan
 
    Depending on the generator used:
      * ``cmake --build .`` --- will build everything
      * ``make -j<jobs>`` --- use makefiles in parallel
      * ``ninja`` --- use Ninja build
-     * ``MsBuild.exe`` --- use Visual Studio MsBuild
      * etc.
 
-#. Check Hyperscan
+#. Check Vectorscan
 
-   Run the Hyperscan unit tests: ::
+   Run the Vectorscan unit tests: ::
 
      bin/unit-hyperscan
 
@@ -55,20 +53,23 @@ Requirements
 Hardware
 ========
 
-Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
-32-bit (IA-32 Architecture) modes.
+Vectorscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
+32-bit (IA-32 Architecture) modes as well as Arm v8.0+ aarch64, and POWER 8+ ppc64le
+machines.
 
 Hyperscan is a high performance software library that takes advantage of recent
-Intel architecture advances. At a minimum, support for Supplemental Streaming
-SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
-x86 processor.
+architecture advances.
 
-Additionally, Hyperscan can make use of:
+Additionally, Vectorscan can make use of:
 
     * Intel Streaming SIMD Extensions 4.2 (SSE4.2)
     * the POPCNT instruction
     * Bit Manipulation Instructions (BMI, BMI2)
     * Intel Advanced Vector Extensions 2 (Intel AVX2)
+    * Arm NEON
+    * Arm SVE and SVE2
+    * Arm SVE2 BITPERM
+    * IBM Power8/Power9 VSX
 
 if present.
 
@@ -79,40 +80,34 @@ These can be determined at library compile time, see :ref:`target_arch`.
 Software
 ========
 
-As a software library, Hyperscan doesn't impose any particular runtime
-software requirements, however to build the Hyperscan library we require a
-modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
+As a software library, Vectorscan doesn't impose any particular runtime
+software requirements, however to build the Vectorscan library we require a
+modern C and C++ compiler -- in particular, Vectorscan requires C99 and C++17
 compiler support. The supported compilers are:
 
-    * GCC, v4.8.1 or higher
-    * Clang, v3.4 or higher (with libstdc++ or libc++)
-    * Intel C++ Compiler v15 or higher
-    * Visual C++ 2017 Build Tools
+    * GCC, v9 or higher
+    * Clang, v5 or higher (with libstdc++ or libc++)
 
-Examples of operating systems that Hyperscan is known to work on include:
+Examples of operating systems that Vectorscan is known to work on include:
 
 Linux:
 
-* Ubuntu 14.04 LTS or newer
+* Ubuntu 20.04 LTS or newer
 * RedHat/CentOS 7 or newer
+* Fedora 38 or newer
+* Debian 10
 
 FreeBSD:
 
 * 10.0 or newer
 
-Windows:
-
-* 8 or newer
-
 Mac OS X:
 
 * 10.8 or newer, using XCode/Clang
 
-Hyperscan *may* compile and run on other platforms, but there is no guarantee.
-We currently have experimental support for Windows using Intel C++ Compiler
-or Visual Studio 2017.
+Vectorscan *may* compile and run on other platforms, but there is no guarantee.
 
-In addition, the following software is required for compiling the Hyperscan library:
+In addition, the following software is required for compiling the Vectorscan library:
 
 ======================================================= =========== ======================================
 Dependency                                              Version     Notes
@@ -132,20 +127,20 @@ Ragel, you may use Cygwin to build it from source.
 Boost Headers
 -------------
 
-Compiling Hyperscan depends on a recent version of the Boost C++ header
+Compiling Vectorscan depends on a recent version of the Boost C++ header
 library. If the Boost libraries are installed on the build machine in the
 usual paths, CMake will find them. If the Boost libraries are not installed,
 the location of the Boost source tree can be specified during the CMake
 configuration step using the ``BOOST_ROOT`` variable (described below).
 
 Another alternative is to put a copy of (or a symlink to) the boost
-subdirectory in ``<hyperscan-source-path>/include/boost``.
+subdirectory in ``<vectorscan-source-path>/include/boost``.
 
 For example: for the Boost-1.59.0 release: ::
 
-    ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
+    ln -s boost_1_59_0/boost <vectorscan-source-path>/include/boost
 
-As Hyperscan uses the header-only parts of Boost, it is not necessary to
+As Vectorscan uses the header-only parts of Boost, it is not necessary to
 compile the Boost libraries.
 
 CMake Configuration
@@ -168,11 +163,12 @@ Common options for CMake include:
 |                        | Valid options are Debug, Release, RelWithDebInfo,  |
 |                        | and MinSizeRel. Default is RelWithDebInfo.         |
 +------------------------+----------------------------------------------------+
-| BUILD_SHARED_LIBS      | Build Hyperscan as a shared library instead of     |
+| BUILD_SHARED_LIBS      | Build Vectorscan as a shared library instead of    |
 |                        | the default static library.                        |
+|                        | Default: Off                                       |
 +------------------------+----------------------------------------------------+
-| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs.       |
-|                        | Default off.                                       |
+| BUILD_STATIC_LIBS      | Build Vectorscan as a static library.              |
+|                        | Default: On                                        |
 +------------------------+----------------------------------------------------+
 | BOOST_ROOT             | Location of Boost source tree.                     |
 +------------------------+----------------------------------------------------+
@@ -180,12 +176,64 @@ Common options for CMake include:
 +------------------------+----------------------------------------------------+
 | FAT_RUNTIME            | Build the :ref:`fat runtime<fat_runtime>`. Default |
 |                        | true on Linux, not available elsewhere.            |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| USE_CPU_NATIVE         | Native CPU detection is off by default, however it |
+|                        | is possible to build a performance-oriented non-fat|
+|                        | library tuned to your CPU.                         |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| SANITIZE               | Use libasan sanitizer to detect possible bugs.     |
+|                        | Valid options are address, memory and undefined.   |
++------------------------+----------------------------------------------------+
+| SIMDE_BACKEND          | Enable SIMDe backend. If this is chosen all native |
+|                        | (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be     |
+|                        | disabled and a SIMDe SSE4.2 emulation backend will |
+|                        | be enabled. This will enable Vectorscan to build   |
+|                        | and run on architectures without SIMD.             |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| SIMDE_NATIVE           | Enable SIMDe native emulation of x86 SSE4.2        |
+|                        | intrinsics on the building platform. That is,      |
+|                        | SSE4.2 intrinsics will be emulated using Neon on   |
+|                        | an Arm platform, or VSX on a Power platform, etc.  |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+
+X86 platform specific options include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| BUILD_AVX2             | Enable code for AVX2.                              |
++------------------------+----------------------------------------------------+
+| BUILD_AVX512           | Enable code for AVX512. Implies BUILD_AVX2.        |
++------------------------+----------------------------------------------------+
+| BUILD_AVX512VBMI       | Enable code for AVX512 with VBMI extension. Implies|
+|                        | BUILD_AVX512.                                      |
++------------------------+----------------------------------------------------+
+
+Arm platform specific options include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| BUILD_SVE              | Enable code for SVE, like on AWS Graviton3 CPUs.   |
+|                        | Not much code is ported just for SVE, but enabling |
+|                        | SVE code production does improve code generation,  |
+|                        | see Benchmarks.                                    |
++------------------------+----------------------------------------------------+
+| BUILD_SVE2             | Enable code for SVE2, implies BUILD_SVE. Most      |
+|                        | non-Neon code is written for SVE2.                 |
++------------------------+----------------------------------------------------+
+| BUILD_SVE2_BITPERM     | Enable code for SVE2_BITPERM hardware feature,     |
+|                        | implies BUILD_SVE2.                                |
 +------------------------+----------------------------------------------------+
 
 For example, to generate a ``Debug`` build: ::
 
     cd <build-dir>
-    cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
+    cmake -DCMAKE_BUILD_TYPE=Debug <vectorscan-source-path>
 
 
 
@@ -193,7 +241,7 @@ Build Type
 ----------
 
 CMake determines a number of features for a build based on the Build Type.
-Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
+Vectorscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
 information". This is a performance optimized build without runtime assertions
 but with debug symbols enabled.
 
@@ -201,7 +249,7 @@ The other types of builds are:
 
  * ``Release``: as above, but without debug symbols
  * ``MinSizeRel``: a stripped release build
- * ``Debug``: used when developing Hyperscan. Includes runtime assertions
+ * ``Debug``: used when developing Vectorscan. Includes runtime assertions
    (which has a large impact on runtime performance), and will also enable
    some other build features like building internal unit
    tests.
@@ -211,7 +259,7 @@ The other types of builds are:
 Target Architecture
 -------------------
 
-Unless using the :ref:`fat runtime<fat_runtime>`, by default Hyperscan will be
+Unless using the :ref:`fat runtime<fat_runtime>`, by default Vectorscan will be
 compiled to target the instruction set of the processor of the machine that
 being used for compilation. This is done via the use of ``-march=native``. The
 result of this means that a library built on one machine may not work on a
@@ -223,7 +271,7 @@ CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. F
 example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
 
     cmake -DCMAKE_C_FLAGS="-march=corei7" \
-      -DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
+      -DCMAKE_CXX_FLAGS="-march=corei7" <vectorscan-source-path>
 
 For more information, refer to :ref:`instr_specialization`.
 
@@ -232,17 +280,17 @@ For more information, refer to :ref:`instr_specialization`.
 Fat Runtime
 -----------
 
-A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan
+A feature introduced in Hyperscan v4.4 is the ability for the Vectorscan
 library to dispatch the most appropriate runtime code for the host processor.
-This feature is called the "fat runtime", as a single Hyperscan library
+This feature is called the "fat runtime", as a single Vectorscan library
 contains multiple copies of the runtime code for different instruction sets.
 
 .. note::
 
     The fat runtime feature is only available on Linux. Release builds of
-    Hyperscan will default to having the fat runtime enabled where supported.
+    Vectorscan will default to having the fat runtime enabled where supported.
 
-When building the library with the fat runtime, the Hyperscan runtime code
+When building the library with the fat runtime, the Vectorscan runtime code
 will be compiled multiple times for these different instruction sets, and
 these compiled objects are combined into one library. There are no changes to
 how user applications are built against this library.
@@ -254,11 +302,11 @@ resolved so that the right version of each API function is used. There is no
 impact on function call performance, as this check and resolution is performed
 by the ELF loader once when the binary is loaded.
 
-If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime
+If the Vectorscan library is used on x86 systems without ``SSE4.2``, the runtime
 API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
 instead of potentially executing illegal instructions. The API function
 :c:func:`hs_valid_platform` can be used by application writers to determine if
-the current platform is supported by Hyperscan.
+the current platform is supported by Vectorscan.
 
 As of this release, the variants of the runtime that are built, and the CPU
 capability that is required, are the following:
@@ -299,6 +347,11 @@ capability that is required, are the following:
 
         cmake -DBUILD_AVX512VBMI=on <...>
 
+    Vectorscan adds support for Arm processors with SVE, SVE2 and SVE2_BITPERM.
+    For example: ::
+
+        cmake -DBUILD_SVE=ON -DBUILD_SVE2=ON -DBUILD_SVE2_BITPERM=ON <...>
+
 As the fat runtime requires compiler, libc, and binutils support, at this time
 it will only be enabled for Linux builds where the compiler supports the
 `indirect function "ifunc" function attribute
diff --git a/doc/dev-reference/index.rst b/doc/dev-reference/index.rst
index b5d6a54b..4046a298 100644
--- a/doc/dev-reference/index.rst
+++ b/doc/dev-reference/index.rst
@@ -1,5 +1,5 @@
 ###############################################
-Hyperscan |version| Developer's Reference Guide
+Vectorscan |version| Developer's Reference Guide
 ###############################################
 
 -------
diff --git a/doc/dev-reference/intro.rst b/doc/dev-reference/intro.rst
index 58879aef..71538eb0 100644
--- a/doc/dev-reference/intro.rst
+++ b/doc/dev-reference/intro.rst
@@ -5,11 +5,11 @@
 Introduction
 ############
 
-Hyperscan is a software regular expression matching engine designed with
+Vectorscan is a software regular expression matching engine designed with
 high performance and flexibility in mind. It is implemented as a library that
 exposes a straightforward C API.
 
-The Hyperscan API itself is composed of two major components:
+The Vectorscan API itself is composed of two major components:
 
 ***********
 Compilation
@@ -17,7 +17,7 @@ Compilation
 
 These functions take a group of regular expressions, along with identifiers and
 option flags, and compile them into an immutable database that can be used by
-the Hyperscan scanning API. This compilation process performs considerable
+the Vectorscan scanning API. This compilation process performs considerable
 analysis and optimization work in order to build a database that will match the
 given expressions efficiently.
 
@@ -36,8 +36,8 @@ See :ref:`compilation` for more detail.
 Scanning
 ********
 
-Once a Hyperscan database has been created, it can be used to scan data in
-memory. Hyperscan provides several scanning modes, depending on whether the
+Once a Vectorscan database has been created, it can be used to scan data in
+memory. Vectorscan provides several scanning modes, depending on whether the
 data to be scanned is available as a single contiguous block, whether it is
 distributed amongst several blocks in memory at the same time, or whether it is
 to be scanned as a sequence of blocks in a stream.
@@ -45,7 +45,7 @@ to be scanned as a sequence of blocks in a stream.
 Matches are delivered to the application via a user-supplied callback function
 that is called synchronously for each match.
 
-For a given database, Hyperscan provides several guarantees:
+For a given database, Vectorscan provides several guarantees:
 
 * No memory allocations occur at runtime with the exception of two
   fixed-size allocations, both of which should be done ahead of time for
@@ -56,7 +56,7 @@ For a given database, Hyperscan provides several guarantees:
     call.
   - **Stream state**: in streaming mode only, some state space is required to
     store data that persists between scan calls for each stream. This allows
-    Hyperscan to track matches that span multiple blocks of data.
+    Vectorscan to track matches that span multiple blocks of data.
 
 * The sizes of the scratch space and stream state (in streaming mode) required
   for a given database are fixed and determined at database compile time. This
@@ -64,7 +64,7 @@ For a given database, Hyperscan provides several guarantees:
   time, and these structures can be pre-allocated if required for performance
   reasons.
 
-* Any pattern that has successfully been compiled by the Hyperscan compiler can
+* Any pattern that has successfully been compiled by the Vectorscan compiler can
   be scanned against any input. There are no internal resource limits or other
   limitations at runtime that could cause a scan call to return an error.
 
@@ -74,12 +74,12 @@ See :ref:`runtime` for more detail.
 Tools
 *****
 
-Some utilities for testing and benchmarking Hyperscan are included with the
+Some utilities for testing and benchmarking Vectorscan are included with the
 library. See :ref:`tools` for more information.
 
 ************
 Example Code
 ************
 
-Some simple example code demonstrating the use of the Hyperscan API is
-available in the ``examples/`` subdirectory of the Hyperscan distribution.
+Some simple example code demonstrating the use of the Vectorscan API is
+available in the ``examples/`` subdirectory of the Vectorscan distribution.
diff --git a/doc/dev-reference/performance.rst b/doc/dev-reference/performance.rst
index 23781bd6..12074ea3 100644
--- a/doc/dev-reference/performance.rst
+++ b/doc/dev-reference/performance.rst
@@ -4,7 +4,7 @@
 Performance Considerations
 ##########################
 
-Hyperscan supports a wide range of patterns in all three scanning modes. It is
+Vectorscan supports a wide range of patterns in all three scanning modes. It is
 capable of extremely high levels of performance, but certain patterns can
 reduce performance markedly.
 
@@ -25,7 +25,7 @@ For example, caseless matching of :regexp:`/abc/` can be written as:
 * :regexp:`/(?i)abc(?-i)/`
 * :regexp:`/abc/i`
 
-Hyperscan is capable of handling all these constructs. Unless there is a
+Vectorscan is capable of handling all these constructs. Unless there is a
 specific reason otherwise, do not rewrite patterns from one form to another.
 
 As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
@@ -41,24 +41,24 @@ Library usage
 
 .. tip:: Do not hand-optimize library usage.
 
-The Hyperscan library is capable of dealing with small writes, unusually large
+The Vectorscan library is capable of dealing with small writes, unusually large
 and small pattern sets, etc. Unless there is a specific performance problem
-with some usage of the library, it is best to use Hyperscan in a simple and
+with some usage of the library, it is best to use Vectorscan in a simple and
 direct fashion. For example, it is unlikely for there to be much benefit in
 buffering input to the library into larger blocks unless streaming writes are
 tiny (say, 1-2 bytes at a time).
 
-Unlike many other pattern matching products, Hyperscan will run faster with
+Unlike many other pattern matching products, Vectorscan will run faster with
 small numbers of patterns and slower with large numbers of patterns in a smooth
 fashion (as opposed to, typically, running at a moderate speed up to some fixed
 limit then either breaking or running half as fast).
 
-Hyperscan also provides high-throughput matching with a single thread of
-control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
+Vectorscan also provides high-throughput matching with a single thread of
+control per core; if a database runs at 3.0 Gbps in Vectorscan it means that a
 3000-bit block of data will be scanned in 1 microsecond in a single thread of
 control, not that it is required to scan 22 3000-bit blocks of data in 22
 microseconds. Thus, it is not usually necessary to buffer data to supply
-Hyperscan with available parallelism.
+Vectorscan with available parallelism.
 
 ********************
 Block-based matching
@@ -72,7 +72,7 @@ accumulated before processing, it should be scanned in block rather than in
 streaming mode.
 
 Unnecessary use of streaming mode reduces the number of optimizations that can
-be applied in Hyperscan and may make some patterns run slower.
+be applied in Vectorscan and may make some patterns run slower.
 
 If there is a mixture of 'block' and 'streaming' mode patterns, these should be
 scanned in separate databases except in the case that the streaming patterns
@@ -107,7 +107,7 @@ Allocate scratch ahead of time
 
 Scratch allocation is not necessarily a cheap operation. Since it is the first
 time (after compilation or deserialization) that a pattern database is used,
-Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
+Vectorscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
 must also allocate memory.
 
 Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
@@ -329,7 +329,7 @@ Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
 the callback is considerably more expensive and general than
 :regexp:`/foo.{300}bar/`.
 
-Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
+Similarly, the :cpp:member:`hs_expr_ext::min_length` extended parameter can be
 used to specify a lower bound on the length of the matches for a pattern. Using
 this facility may be more lightweight in some circumstances than using the SOM
 flag and post-confirming match length in the calling application.
diff --git a/doc/dev-reference/preface.rst b/doc/dev-reference/preface.rst
index 68373b7f..5739690f 100644
--- a/doc/dev-reference/preface.rst
+++ b/doc/dev-reference/preface.rst
@@ -6,35 +6,35 @@ Preface
 Overview
 ********
 
-Hyperscan is a regular expression engine designed to offer high performance, the
+Vectorscan is a regular expression engine designed to offer high performance, the
 ability to match multiple expressions simultaneously and flexibility in
 scanning operation.
 
 Patterns are provided to a compilation interface which generates an immutable
 pattern database. The scan interface then can be used to scan a target data
 buffer for the given patterns, returning any matching results from that data
-buffer. Hyperscan also provides a streaming mode, in which matches that span
+buffer. Vectorscan also provides a streaming mode, in which matches that span
 several blocks in a stream are detected.
 
-This document is designed to facilitate code-level integration of the Hyperscan
+This document is designed to facilitate code-level integration of the Vectorscan
 library with existing or new applications.
 
-:ref:`intro` is a short overview of the Hyperscan library, with more detail on
-the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
+:ref:`intro` is a short overview of the Vectorscan library, with more detail on
+the Vectorscan API provided in the subsequent sections: :ref:`compilation` and
 :ref:`runtime`.
 
 :ref:`perf` provides details on various factors which may impact the
-performance of a Hyperscan integration.
+performance of a Vectorscan integration.
 
 :ref:`api_constants` and :ref:`api_files` provides a detailed summary of the
-Hyperscan Application Programming Interface (API).
+Vectorscan Application Programming Interface (API).
 
 ********
 Audience
 ********
 
-This guide is aimed at developers interested in integrating Hyperscan into an
-application. For information on building the Hyperscan library, see the Quick
+This guide is aimed at developers interested in integrating Vectorscan into an
+application. For information on building the Vectorscan library, see the Quick
 Start Guide.
 
 ***********
diff --git a/doc/dev-reference/runtime.rst b/doc/dev-reference/runtime.rst
index 396521c9..249fd235 100644
--- a/doc/dev-reference/runtime.rst
+++ b/doc/dev-reference/runtime.rst
@@ -4,7 +4,7 @@
 Scanning for Patterns
 #####################
 
-Hyperscan provides three different scanning modes, each with its own scan
+Vectorscan provides three different scanning modes, each with its own scan
 function beginning with ``hs_scan``. In addition, streaming mode has a number
 of other API functions for managing stream state.
 
@@ -33,8 +33,8 @@ See :c:type:`match_event_handler` for more information.
 Streaming Mode
 **************
 
-The core of the Hyperscan streaming runtime API consists of functions to open,
-scan, and close Hyperscan data streams:
+The core of the Vectorscan streaming runtime API consists of functions to open,
+scan, and close Vectorscan data streams:
 
 * :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.
 
@@ -57,14 +57,14 @@ will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
 still call :c:func:`hs_close_stream` to complete the clean-up process for that
 stream.
 
-Streams exist in the Hyperscan library so that pattern matching state can be
+Streams exist in the Vectorscan library so that pattern matching state can be
 maintained across multiple blocks of target data -- without maintaining this
 state, it would not be possible to detect patterns that span these blocks of
 data. This, however, does come at the cost of requiring an amount of storage
 per-stream (the size of this storage is fixed at compile time), and a slight
 performance penalty in some cases to manage the state.
 
-While Hyperscan does always support a strict ordering of multiple matches,
+While Vectorscan does always support a strict ordering of multiple matches,
 streaming matches will not be delivered at offsets before the current stream
 write, with the exception of zero-width asserts, where constructs such as
 :regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
@@ -76,7 +76,7 @@ Stream Management
 =================
 
 In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
-:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
+:c:func:`hs_close_stream`, the Vectorscan API provides a number of other
 functions for the management of streams:
 
 * :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
@@ -98,10 +98,10 @@ A stream object is allocated as a fixed size region of memory which has been
 sized to ensure that no memory allocations are required during scan
 operations. When the system is under memory pressure, it may be useful to reduce
 the memory consumed by streams that are not expected to be used soon. The
-Hyperscan API provides calls for translating a stream to and from a compressed
+Vectorscan API provides calls for translating a stream to and from a compressed
 representation for this purpose. The compressed representation differs from the
 full stream object as it does not reserve space for components which are not
-required given the current stream state. The Hyperscan API functions for this
+required given the current stream state. The Vectorscan API functions for this
 functionality are:
 
 * :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
@@ -157,7 +157,7 @@ scanned in block mode.
 Scratch Space
 *************
 
-While scanning data, Hyperscan needs a small amount of temporary memory to store
+While scanning data, Vectorscan needs a small amount of temporary memory to store
 on-the-fly internal data. This amount is unfortunately too large to fit on the
 stack, particularly for embedded applications, and allocating memory dynamically
 is too expensive, so a pre-allocated "scratch" space must be provided to the
@@ -170,7 +170,7 @@ databases, only a single scratch region is necessary: in this case, calling
 will ensure that the scratch space is large enough to support scanning against
 any of the given databases.
 
-While the Hyperscan library is re-entrant, the use of scratch spaces is not.
+While the Vectorscan library is re-entrant, the use of scratch spaces is not.
 For example, if by design it is deemed necessary to run recursive or nested
 scanning (say, from the match callback function), then an additional scratch
 space is required for that context.
@@ -219,11 +219,11 @@ For example:
 Custom Allocators
 *****************
 
-By default, structures used by Hyperscan at runtime (scratch space, stream
+By default, structures used by Vectorscan at runtime (scratch space, stream
 state, etc) are allocated with the default system allocators, usually
 ``malloc()`` and ``free()``.
 
-The Hyperscan API provides a facility for changing this behaviour to support
+The Vectorscan API provides a facility for changing this behaviour to support
 applications that use custom memory allocators.
 
 These functions are:
diff --git a/doc/dev-reference/serialization.rst b/doc/dev-reference/serialization.rst
index 4f884c75..5950e607 100644
--- a/doc/dev-reference/serialization.rst
+++ b/doc/dev-reference/serialization.rst
@@ -4,7 +4,7 @@
 Serialization
 #############
 
-For some applications, compiling Hyperscan pattern databases immediately prior
+For some applications, compiling Vectorscan pattern databases immediately prior
 to use is not an appropriate design. Some users may wish to:
 
 * Compile pattern databases on a different host;
@@ -14,9 +14,9 @@ to use is not an appropriate design. Some users may wish to:
 
 * Control the region of memory in which the compiled database is located.
 
-Hyperscan pattern databases are not completely flat in memory: they contain
+Vectorscan pattern databases are not completely flat in memory: they contain
 pointers and have specific alignment requirements. Therefore, they cannot be
-copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
+copied (or otherwise relocated) directly. To enable these use cases, Vectorscan
 provides functionality for serializing and deserializing compiled pattern
 databases.
 
@@ -40,10 +40,10 @@ The API provides the following functions:
    returns a string containing information about the database. This call is
    analogous to :c:func:`hs_database_info`.
 
-.. note:: Hyperscan performs both version and platform compatibility checks
+.. note:: Vectorscan performs both version and platform compatibility checks
    upon deserialization. The :c:func:`hs_deserialize_database` and
    :c:func:`hs_deserialize_database_at` functions will only permit the
-   deserialization of databases compiled with (a) the same version of Hyperscan
+   deserialization of databases compiled with (a) the same version of Vectorscan
    and (b) platform features supported by the current host platform. See
    :ref:`instr_specialization` for more information on platform specialization.
 
@@ -51,17 +51,17 @@ The API provides the following functions:
 The Runtime Library
 ===================
 
-The main Hyperscan library (``libhs``) contains both the compiler and runtime
-portions of the library. This means that in order to support the Hyperscan
+The main Vectorscan library (``libhs``) contains both the compiler and runtime
+portions of the library. This means that in order to support the Vectorscan
 compiler, which is written in C++, it requires C++ linkage and has a
 dependency on the C++ standard library.
 
 Many embedded applications require only the scanning ("runtime") portion of the
-Hyperscan library. In these cases, pattern compilation generally takes place on
+Vectorscan library. In these cases, pattern compilation generally takes place on
 another host, and serialized pattern databases are delivered to the application
 for use.
 
 To support these applications without requiring the C++ dependency, a
-runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
+runtime-only version of the Vectorscan library, called ``libhs_runtime``, is also
 distributed. This library does not depend on the C++ standard library and
-provides all Hyperscan functions other that those used to compile databases.
+provides all Vectorscan functions other than those used to compile databases.
diff --git a/doc/dev-reference/tools.rst b/doc/dev-reference/tools.rst
index e0465fc6..f6d51515 100644
--- a/doc/dev-reference/tools.rst
+++ b/doc/dev-reference/tools.rst
@@ -4,14 +4,14 @@
 Tools
 #####
 
-This section describes the set of utilities included with the Hyperscan library.
+This section describes the set of utilities included with the Vectorscan library.
 
 ********************
 Quick Check: hscheck
 ********************
 
-The ``hscheck`` tool allows the user to quickly check whether Hyperscan supports
-a group of patterns. If a pattern is rejected by Hyperscan's compiler, the
+The ``hscheck`` tool allows the user to quickly check whether Vectorscan supports
+a group of patterns. If a pattern is rejected by Vectorscan's compiler, the
 compile error is provided on standard output.
 
 For example, given the following three patterns (the last of which contains a
@@ -34,7 +34,7 @@ syntax error) in a file called ``/tmp/test``::
 Benchmarker: hsbench
 ********************
 
-The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
+The ``hsbench`` tool provides an easy way to measure Vectorscan's performance
 for a particular set of patterns and corpus of data to be scanned.
 
 Patterns are supplied in the format described below in
@@ -44,7 +44,7 @@ easy control of how a corpus is broken into blocks and streams.
 
 .. note:: A group of Python scripts for constructing corpora databases from
    various input types, such as PCAP network traffic captures or text files, can
-   be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
+   be found in the Vectorscan source tree in ``tools/hsbench/scripts``.
 
 Running hsbench
 ===============
@@ -56,7 +56,7 @@ produce output like this::
     $ hsbench -e /tmp/patterns -c /tmp/corpus.db
 
     Signatures:        /tmp/patterns
-    Hyperscan info:    Version: 4.3.1 Features:  AVX2 Mode: STREAM
+    Vectorscan info:    Version: 5.4.11 Features:  AVX2 Mode: STREAM
     Expression count:  200
     Bytecode size:     342,540 bytes
     Database CRC:      0x6cd6b67c
@@ -77,7 +77,7 @@ takes to perform all twenty scans. The number of repeats can be changed with the
 ``-n`` argument, and the results of each scan will be displayed if the
 ``--per-scan`` argument is specified.
 
-To benchmark Hyperscan on more than one core, you can supply a list of cores
+To benchmark Vectorscan on more than one core, you can supply a list of cores
 with the ``-T`` argument, which will instruct ``hsbench`` to start one
 benchmark thread per core given and compute the throughput from the time taken
 to complete all of them.
@@ -91,17 +91,17 @@ Correctness Testing: hscollider
 *******************************
 
 The ``hscollider`` tool, or Pattern Collider, provides a way to verify
-Hyperscan's matching behaviour. It does this by compiling and scanning patterns
+Vectorscan's matching behaviour. It does this by compiling and scanning patterns
 (either singly or in groups) against known corpora and comparing the results
 against another engine (the "ground truth"). Two sources of ground truth for
 comparison are available:
 
  * The PCRE library (http://pcre.org/).
- * An NFA simulation run on Hyperscan's compile-time graph representation. This
+ * An NFA simulation run on Vectorscan's compile-time graph representation. This
    is used if PCRE cannot support the pattern or if PCRE execution fails due to
    a resource limit.
 
-Much of Hyperscan's testing infrastructure is built on ``hscollider``, and the
+Much of Vectorscan's testing infrastructure is built on ``hscollider``, and the
 tool is designed to take advantage of multiple cores and provide considerable
 flexibility in controlling the test. These options are described in the help
 (``hscollider -h``) and include:
@@ -116,11 +116,11 @@ flexibility in controlling the test. These options are described in the help
 Using hscollider to debug a pattern
 ===================================
 
-One common use-case for ``hscollider`` is to determine whether Hyperscan will
+One common use-case for ``hscollider`` is to determine whether Vectorscan will
 match a pattern in the expected location, and whether this accords with PCRE's
 behaviour for the same case.
 
-Here is an example. We put our pattern in a file in Hyperscan's pattern
+Here is an example. We put our pattern in a file in Vectorscan's pattern
 format::
 
     $ cat /tmp/pat
@@ -172,7 +172,7 @@ individual matches are displayed in the output::
 
     Total elapsed time: 0.00522815 secs.
 
-We can see from this output that both PCRE and Hyperscan find matches ending at
+We can see from this output that both PCRE and Vectorscan find matches ending at
 offset 33 and 45, and so ``hscollider`` considers this test case to have
 passed.
 
@@ -180,13 +180,13 @@ passed.
 corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)
 
 .. note:: In default operation, PCRE produces only one match for a scan, unlike
-  Hyperscan's automata semantics. The ``hscollider`` tool uses libpcre's
-  "callout" functionality to match Hyperscan's semantics.
+  Vectorscan's automata semantics. The ``hscollider`` tool uses libpcre's
+  "callout" functionality to match Vectorscan's semantics.
 
 Running a larger scan test
 ==========================
 
-A set of patterns for testing purposes are distributed with Hyperscan, and these
+A set of patterns for testing purposes are distributed with Vectorscan, and these
 can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
 provided to do this easily:
 
@@ -202,10 +202,10 @@ Debugging: hsdump
 *****************
 
 When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
-``Debug``), Hyperscan includes support for dumping information about its
+``Debug``), Vectorscan includes support for dumping information about its
 internals during pattern compilation with the ``hsdump`` tool.
 
-This information is mostly of use to Hyperscan developers familiar with the
+This information is mostly of use to Vectorscan developers familiar with the
 library's internal structure, but can be used to diagnose issues with patterns
 and provide more information in bug reports.
 
@@ -215,7 +215,7 @@ and provide more information in bug reports.
 Pattern Format
 **************
 
-All of the Hyperscan tools accept patterns in the same format, read from plain
+All of the Vectorscan tools accept patterns in the same format, read from plain
 text files with one pattern per line. Each line looks like this:
 
 * ``<integer id>:/<regex>/<flags>``
@@ -227,12 +227,12 @@ For example::
     3:/^.{10,20}hatstand/m
 
 The integer ID is the value that will be reported when a match is found by
-Hyperscan and must be unique.
+Vectorscan and must be unique.
 
 The pattern itself is a regular expression in PCRE syntax; see
 :ref:`compilation` for more information on supported features.
 
-The flags are single characters that map to Hyperscan flags as follows:
+The flags are single characters that map to Vectorscan flags as follows:
 
 =========   =================================    ===========
 Character   API Flag                             Description
@@ -256,7 +256,7 @@ between braces, separated by commas. For example::
 
     1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
 
-All Hyperscan tools will accept a pattern file (or a directory containing
+All Vectorscan tools will accept a pattern file (or a directory containing
 pattern files) with the ``-e`` argument. If no further arguments constraining
 the pattern set are given, all patterns in those files are used.
 

From 0c57b6c89490303757aca3ba2d0515f7f8752765 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Tue, 20 Feb 2024 13:48:05 -0600
Subject: [PATCH 05/56] pkgconfig: Correct library description

Correct the description in the pkgconfig file, but
leave the name alone as we want to remain compatible
with projects utilizing hyperscan.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 libhs.pc.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libhs.pc.in b/libhs.pc.in
index 3ad2b90c..d1e3ffb0 100644
--- a/libhs.pc.in
+++ b/libhs.pc.in
@@ -4,7 +4,7 @@ libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
 includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
 
 Name: libhs
-Description: Intel(R) Hyperscan Library
+Description: A portable fork of the high-performance regular expression matching library
 Version: @HS_VERSION@
 Libs: -L${libdir} -lhs
 Cflags: -I${includedir}/hs

From 6bbd4821f0ad090a68d6e3dfffbc3dc9ad5d4da1 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Tue, 20 Feb 2024 15:01:40 -0600
Subject: [PATCH 06/56] hsbench: Update test program output

While fixing the documentation, it was noticed that the hsbench
output was still referring to the project as Hyperscan.
Let's correct it.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 tools/hsbench/engine_hyperscan.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp
index 95461de5..f3de35ef 100644
--- a/tools/hsbench/engine_hyperscan.cpp
+++ b/tools/hsbench/engine_hyperscan.cpp
@@ -248,7 +248,7 @@ void EngineHyperscan::printStats() const {
         printf("Signature set:        %s\n", compile_stats.sigs_name.c_str());
     }
     printf("Signatures:        %s\n", compile_stats.signatures.c_str());
-    printf("Hyperscan info:    %s\n", compile_stats.db_info.c_str());
+    printf("Vectorscan info:    %s\n", compile_stats.db_info.c_str());
     printf("Expression count:  %'zu\n", compile_stats.expressionCount);
     printf("Bytecode size:     %'zu bytes\n", compile_stats.compiledSize);
     printf("Database CRC:      0x%x\n", compile_stats.crc32);

From f9e254ab415a22ae59bab86f235784cfbf2572d4 Mon Sep 17 00:00:00 2001
From: Yoan Picchi <yoan.picchi@arm.com>
Date: Thu, 15 Feb 2024 13:51:19 +0000
Subject: [PATCH 07/56] Enable sheng32/64 for SVE

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 src/nfa/sheng.c          |   8 +-
 src/nfa/sheng.h          |   8 +-
 src/nfa/sheng_defs.h     |  70 +++----
 src/nfa/sheng_impl.h     | 127 ++++++++++++
 src/nfa/sheng_impl4.h    | 428 +++++++++++++++++++++++++++++++++++++++
 src/nfa/shengcompile.cpp |  14 ++
 6 files changed, 612 insertions(+), 43 deletions(-)

diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c
index 3f36e218..922e8f80 100644
--- a/src/nfa/sheng.c
+++ b/src/nfa/sheng.c
@@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt,
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 // Sheng32
 static really_inline
 const struct sheng32 *get_sheng32(const struct NFA *n) {
@@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
     }
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || HAVE_SVE
 
 /* include Sheng function definitions */
 #include "sheng_defs.h"
@@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest,
     return 0;
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 // Sheng32
 static really_inline
 char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
@@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest,
     *(u8 *)dest = *(const u8 *)src;
     return 0;
 }
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || HAVE_SVE
diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h
index 7b90e303..212bd3a4 100644
--- a/src/nfa/sheng.h
+++ b/src/nfa/sheng.h
@@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q);
 char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer,
                     size_t length, NfaCallback cb, void *context);
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
 
@@ -106,8 +106,7 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q);
 
 char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
                       size_t length, NfaCallback cb, void *context);
-
-#else // !HAVE_AVX512VBMI
+#else // !HAVE_AVX512VBMI && !HAVE_SVE
 
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
@@ -138,6 +137,7 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
 #define nfaExecSheng64_testEOD NFA_API_NO_IMPL
 #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL
 #define nfaExecSheng64_B NFA_API_NO_IMPL
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || HAVE_SVE
+
 
 #endif /* SHENG_H_ */
diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h
index 390af752..886af28e 100644
--- a/src/nfa/sheng_defs.h
+++ b/src/nfa/sheng_defs.h
@@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) {
     return (a | b | c | d) & (SHENG_STATE_FLAG_MASK);
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 static really_inline
 u8 isDeadState32(const u8 a) {
     return a & SHENG32_STATE_DEAD;
@@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_cod
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_cod
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_co
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_co
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_samd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_samd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_sam
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_sam
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nmd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_nmd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 dummyFunc
@@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nm
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_nm
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 dummyFunc
@@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_coda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_cod
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_coa
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_co
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_samda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_samd
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -511,7 +511,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_sama
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_sam
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC isAccelState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nmda
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nmd
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nm
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h
index 1fa5c831..9634fa65 100644
--- a/src/nfa/sheng_impl.h
+++ b/src/nfa/sheng_impl.h
@@ -96,6 +96,133 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
+#if defined(HAVE_SVE)
+
+/* SVE Sheng32 scan over [start, end): steps *state one input byte at a time
+ * and fires reports on accept states; a halting callback returns at once. */
+static really_inline
+char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng32 *s, u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id, u8 single,
+                  u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFA execution in state %u\n",
+                 *state & SHENG32_STATE_MASK);
+    const u8 *cur_buf = start;
+    if (DEAD_FUNC32(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
+    /* broadcast the scalar state byte; only *state itself is read */
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    const svuint8_t id_mask = svdup_u8((u8)SHENG32_STATE_MASK);
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+    while (likely(cur_buf != end)) {
+        const u8 c = *cur_buf;
+        svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c));
+        /* next = succ_mask[state id]; flag bits are masked off the TBL index */
+        cur_state = svtbl(succ_mask, svand_x(lane_pred_32, cur_state, id_mask));
+        const u8 tmp = svlastb(lane_pred_32, cur_state);
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK,
+                     tmp & SHENG32_STATE_FLAG_MASK);
+        if (unlikely(ACCEPT_FUNC32(tmp))) {
+            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK);
+            u64a match_offset = base_offset + (cur_buf - buf) + 1;
+            DEBUG_PRINTF("Match @ %llu\n", match_offset);
+            if (STOP_AT_MATCH) {
+                DEBUG_PRINTF("Stopping at match @ %lli\n",
+                             (u64a)(cur_buf - start));
+                *state = tmp;
+                *scan_end = cur_buf;
+                return MO_MATCHES_PENDING;
+            }
+            if (single) {
+                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                    MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            } else {
+                if (fireReports32(s, cb, ctxt, tmp, match_offset,
+                                  cached_accept_state, cached_accept_id,
+                                  0) == MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            }
+        }
+        cur_buf++;
+    }
+    *state = svlastb(lane_pred_32, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+
+/* SVE Sheng64 scan over [start, end): steps *state one input byte at a time
+ * and fires reports on accept states; a halting callback returns at once. */
+static really_inline
+char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng64 *s, u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id, u8 single,
+                  u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFA execution in state %u\n",
+                 *state & SHENG64_STATE_MASK);
+    const u8 *cur_buf = start;
+    if (DEAD_FUNC64(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
+    /* broadcast the scalar state byte; only *state itself is read */
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    const svuint8_t id_mask = svdup_u8((u8)SHENG64_STATE_MASK);
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+    while (likely(cur_buf != end)) {
+        const u8 c = *cur_buf;
+        svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c));
+        /* next = succ_mask[state id]; flag bits are masked off the TBL index */
+        cur_state = svtbl(succ_mask, svand_x(lane_pred_64, cur_state, id_mask));
+        const u8 tmp = svlastb(lane_pred_64, cur_state);
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK,
+                     tmp & SHENG64_STATE_FLAG_MASK);
+        if (unlikely(ACCEPT_FUNC64(tmp))) {
+            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK);
+            u64a match_offset = base_offset + (cur_buf - buf) + 1;
+            DEBUG_PRINTF("Match @ %llu\n", match_offset);
+            if (STOP_AT_MATCH) {
+                DEBUG_PRINTF("Stopping at match @ %lli\n",
+                             (u64a)(cur_buf - start));
+                *state = tmp;
+                *scan_end = cur_buf;
+                return MO_MATCHES_PENDING;
+            }
+            if (single) {
+                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                    MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            } else {
+                if (fireReports64(s, cb, ctxt, tmp, match_offset,
+                                  cached_accept_state, cached_accept_id,
+                                  0) == MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            }
+        }
+        cur_buf++;
+    }
+    *state = svlastb(lane_pred_64, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+#endif
+
 #if defined(HAVE_AVX512VBMI)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h
index e5d3468f..10ad4ea0 100644
--- a/src/nfa/sheng_impl4.h
+++ b/src/nfa/sheng_impl4.h
@@ -283,6 +283,434 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
+#if defined(HAVE_SVE)
+static really_inline
+char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng32 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
+                 *state & SHENG32_STATE_MASK);
+    const u8 *cur_buf = start;
+    const u8 *min_accel_dist = start;
+    base_offset++;
+    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
+
+    if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) {
+        DEBUG_PRINTF("Accel state reached @ 0\n");
+        const union AccelAux *aaux =
+            get_accel32(s, *state & SHENG32_STATE_MASK);
+        const u8 *new_offset = run_accel(aaux, cur_buf, end);
+        if (new_offset < cur_buf + BAD_ACCEL_DIST) {
+            min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+        } else {
+            min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+        }
+        DEBUG_PRINTF("Next accel chance: %llu\n",
+                     (u64a)(min_accel_dist - start));
+        DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf);
+        cur_buf = new_offset;
+        DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start));
+    }
+    if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    svuint8_t cur_state = svld1(lane_pred_32, state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(end - cur_buf >= 4)) {
+        const u8 *b1 = cur_buf;
+        const u8 *b2 = cur_buf + 1;
+        const u8 *b3 = cur_buf + 2;
+        const u8 *b4 = cur_buf + 3;
+        const u8 c1 = *b1;
+        const u8 c2 = *b2;
+        const u8 c3 = *b3;
+        const u8 c4 = *b4;
+        svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1));
+        cur_state = svtbl(cur_state, succ_mask1);
+        const u8 a1 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2));
+        cur_state = svtbl(cur_state, succ_mask2);
+        const u8 a2 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3));
+        cur_state = svtbl(cur_state, succ_mask3);
+        const u8 a3 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4));
+        cur_state = svtbl(cur_state, succ_mask4);
+        const u8 a4 = svlastb(lane_pred_32, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK,
+                     a1 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK,
+                     a2 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK,
+                     a3 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK,
+                     a4 & SHENG32_STATE_FLAG_MASK);
+
+        if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) {
+            if (ACCEPT_FUNC32(a1)) {
+                u64a match_offset = base_offset + b1 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a1 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b1 - start));
+                    *scan_end = b1;
+                    *state = a1;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a1, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a2)) {
+                u64a match_offset = base_offset + b2 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a2 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b2 - start));
+                    *scan_end = b2;
+                    *state = a2;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a2, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a3)) {
+                u64a match_offset = base_offset + b3 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a3 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b3 - start));
+                    *scan_end = b3;
+                    *state = a3;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a3, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a4)) {
+                u64a match_offset = base_offset + b4 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a4 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b4 - start));
+                    *scan_end = b4;
+                    *state = a4;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a4, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (INNER_DEAD_FUNC32(a4)) {
+                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
+                *scan_end = end;
+                *state = a4;
+                return MO_CONTINUE_MATCHING;
+            }
+            if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) {
+                DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
+                const union AccelAux *aaux =
+                    get_accel32(s, a4 & SHENG32_STATE_MASK);
+                const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
+                if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
+                    min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+                } else {
+                    min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+                }
+                DEBUG_PRINTF("Next accel chance: %llu\n",
+                             (u64a)(min_accel_dist - start));
+                DEBUG_PRINTF("Accel scanned %llu bytes\n",
+                             (u64a)(new_offset - cur_buf - 4));
+                cur_buf = new_offset;
+                DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
+                continue;
+            }
+        }
+        if (OUTER_DEAD_FUNC32(a4)) {
+            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
+            *scan_end = end;
+            *state = a4;
+            return MO_CONTINUE_MATCHING;
+        };
+        if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) {
+            DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
+            const union AccelAux *aaux =
+                get_accel32(s, a4 & SHENG32_STATE_MASK);
+            const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
+            if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
+                min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+            } else {
+                min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+            }
+            DEBUG_PRINTF("Next accel chance: %llu\n",
+                         (u64a)(min_accel_dist - start));
+            DEBUG_PRINTF("Accel scanned %llu bytes\n",
+                         (u64a)(new_offset - cur_buf - 4));
+            cur_buf = new_offset;
+            DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
+            continue;
+        };
+        cur_buf += 4;
+    }
+    *state = svlastb(lane_pred_32, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+
+#if !defined(NO_SHENG64_IMPL)
+static really_inline
+char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng64 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
+                 *state & SHENG64_STATE_MASK);
+    const u8 *cur_buf = start;
+    base_offset++;
+    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
+
+    if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    svuint8_t cur_state = svld1(lane_pred_64, state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(end - cur_buf >= 4)) {
+        const u8 *b1 = cur_buf;
+        const u8 *b2 = cur_buf + 1;
+        const u8 *b3 = cur_buf + 2;
+        const u8 *b4 = cur_buf + 3;
+        const u8 c1 = *b1;
+        const u8 c2 = *b2;
+        const u8 c3 = *b3;
+        const u8 c4 = *b4;
+
+        svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1));
+        cur_state = svtbl(cur_state, succ_mask1);
+        const u8 a1 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2));
+        cur_state = svtbl(cur_state, succ_mask2);
+        const u8 a2 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3));
+        cur_state = svtbl(cur_state, succ_mask3);
+        const u8 a3 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4));
+        cur_state = svtbl(cur_state, succ_mask4);
+        const u8 a4 = svlastb(lane_pred_64, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK,
+                     a1 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK,
+                     a2 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK,
+                     a3 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK,
+                     a4 & SHENG64_STATE_FLAG_MASK);
+
+        if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) {
+            if (ACCEPT_FUNC64(a1)) {
+                u64a match_offset = base_offset + b1 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a1 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b1 - start));
+                    *scan_end = b1;
+                    *state = a1;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a1, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a2)) {
+                u64a match_offset = base_offset + b2 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a2 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b2 - start));
+                    *scan_end = b2;
+                    *state = a2;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a2, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a3)) {
+                u64a match_offset = base_offset + b3 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a3 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b3 - start));
+                    *scan_end = b3;
+                    *state = a3;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a3, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a4)) {
+                u64a match_offset = base_offset + b4 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a4 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b4 - start));
+                    *scan_end = b4;
+                    *state = a4;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a4, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (INNER_DEAD_FUNC64(a4)) {
+                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
+                *scan_end = end;
+                *state = a4;
+                return MO_CONTINUE_MATCHING;
+            }
+        }
+        if (OUTER_DEAD_FUNC64(a4)) {
+            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
+            *scan_end = end;
+            *state = a4;
+            return MO_CONTINUE_MATCHING;
+        }
+        cur_buf += 4;
+    }
+    *state = svlastb(lane_pred_64, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+#endif
+#endif
+
 #if defined(HAVE_AVX512VBMI)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index 055e1971..0f93e139 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -730,10 +730,17 @@ bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
+#ifdef HAVE_SVE
+    if (svcntb()<32) {
+        DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n");
+        return nullptr;
+    }
+#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
+#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);
@@ -762,10 +769,17 @@ bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
+#ifdef HAVE_SVE
+    if (svcntb()<64) {
+        DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n");
+        return nullptr;
+    }
+#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
+#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);

From f5412b3509082a3278fd95a3bb0247916d4c0823 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@users.noreply.github.com>
Date: Tue, 19 Mar 2024 11:40:23 +0200
Subject: [PATCH 08/56] Revert "RFC Enable sheng32/64 for SVE"

---
 src/nfa/sheng.c          |   8 +-
 src/nfa/sheng.h          |   8 +-
 src/nfa/sheng_defs.h     |  70 +++----
 src/nfa/sheng_impl.h     | 127 ------------
 src/nfa/sheng_impl4.h    | 428 ---------------------------------------
 src/nfa/shengcompile.cpp |  14 --
 6 files changed, 43 insertions(+), 612 deletions(-)

diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c
index 922e8f80..3f36e218 100644
--- a/src/nfa/sheng.c
+++ b/src/nfa/sheng.c
@@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt,
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
 
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 // Sheng32
 static really_inline
 const struct sheng32 *get_sheng32(const struct NFA *n) {
@@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
     }
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
-#endif // end of HAVE_AVX512VBMI || HAVE_SVE
+#endif // end of HAVE_AVX512VBMI
 
 /* include Sheng function definitions */
 #include "sheng_defs.h"
@@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest,
     return 0;
 }
 
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 // Sheng32
 static really_inline
 char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
@@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest,
     *(u8 *)dest = *(const u8 *)src;
     return 0;
 }
-#endif // end of HAVE_AVX512VBMI || HAVE_SVE
+#endif // end of HAVE_AVX512VBMI
diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h
index 212bd3a4..7b90e303 100644
--- a/src/nfa/sheng.h
+++ b/src/nfa/sheng.h
@@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q);
 char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer,
                     size_t length, NfaCallback cb, void *context);
 
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
 
@@ -106,7 +106,8 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q);
 
 char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
                       size_t length, NfaCallback cb, void *context);
-#else // !HAVE_AVX512VBMI && !HAVE_SVE
+
+#else // !HAVE_AVX512VBMI
 
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
@@ -137,7 +138,6 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
 #define nfaExecSheng64_testEOD NFA_API_NO_IMPL
 #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL
 #define nfaExecSheng64_B NFA_API_NO_IMPL
-#endif // end of HAVE_AVX512VBMI || defined(HAVE_SVE)
-
+#endif // end of HAVE_AVX512VBMI
 
 #endif /* SHENG_H_ */
diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h
index 886af28e..390af752 100644
--- a/src/nfa/sheng_defs.h
+++ b/src/nfa/sheng_defs.h
@@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) {
     return (a | b | c | d) & (SHENG_STATE_FLAG_MASK);
 }
 
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 static really_inline
 u8 isDeadState32(const u8 a) {
     return a & SHENG32_STATE_DEAD;
@@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_cod
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_cod
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_co
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_co
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_samd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_samd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_sam
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_sam
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nmd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_nmd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 dummyFunc
@@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nm
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_nm
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 dummyFunc
@@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_coda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_cod
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_coa
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_co
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_samda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_samd
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -511,7 +511,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_sama
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_sam
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC isAccelState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_nmda
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_nmd
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #define SHENG32_IMPL sheng32_4_nm
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+#if defined(HAVE_AVX512VBMI)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h
index 9634fa65..1fa5c831 100644
--- a/src/nfa/sheng_impl.h
+++ b/src/nfa/sheng_impl.h
@@ -96,133 +96,6 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
-#if defined(HAVE_SVE)
-
-static really_inline
-char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
-                  const struct sheng32 *s,
-                  u8 *const cached_accept_state,
-                  ReportID *const cached_accept_id,
-                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
-                  const u8 *end, const u8 **scan_end) {
-    DEBUG_PRINTF("Starting DFA execution in state %u\n",
-                 *state & SHENG32_STATE_MASK);
-    const u8 *cur_buf = start;
-    if (DEAD_FUNC32(*state)) {
-        DEBUG_PRINTF("Dead on arrival\n");
-        *scan_end = end;
-        return MO_CONTINUE_MATCHING;
-    }
-    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
-
-    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
-    svuint8_t cur_state = svld1(lane_pred_32, state);
-    const m512 *masks = s->succ_masks;
-
-    while (likely(cur_buf != end)) {
-        const u8 c = *cur_buf;
-        svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c));
-        cur_state = svtbl(cur_state, succ_mask);
-        const u8 tmp = svlastb(lane_pred_32, cur_state);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK,
-                     tmp & SHENG32_STATE_FLAG_MASK);
-
-        if (unlikely(ACCEPT_FUNC32(tmp))) {
-            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK);
-            u64a match_offset = base_offset + (cur_buf - buf) + 1;
-            DEBUG_PRINTF("Match @ %llu\n", match_offset);
-            if (STOP_AT_MATCH) {
-                DEBUG_PRINTF("Stopping at match @ %lli\n",
-                             (u64a)(cur_buf - start));
-                *state = tmp;
-                *scan_end = cur_buf;
-                return MO_MATCHES_PENDING;
-            }
-            if (single) {
-                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                    MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
-                }
-            } else {
-                if (fireReports32(s, cb, ctxt, tmp, match_offset,
-                                  cached_accept_state, cached_accept_id,
-                                  0) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
-                }
-            }
-        }
-        cur_buf++;
-    }
-    *state = svlastb(lane_pred_32, cur_state);
-    *scan_end = cur_buf;
-    return MO_CONTINUE_MATCHING;
-}
-
-static really_inline
-char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
-                  const struct sheng64 *s,
-                  u8 *const cached_accept_state,
-                  ReportID *const cached_accept_id,
-                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
-                  const u8 *end, const u8 **scan_end) {
-    DEBUG_PRINTF("Starting DFA execution in state %u\n",
-                 *state & SHENG64_STATE_MASK);
-    const u8 *cur_buf = start;
-    if (DEAD_FUNC64(*state)) {
-        DEBUG_PRINTF("Dead on arrival\n");
-        *scan_end = end;
-        return MO_CONTINUE_MATCHING;
-    }
-    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
-
-    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
-    svuint8_t cur_state = svld1(lane_pred_64, state);
-    const m512 *masks = s->succ_masks;
-
-    while (likely(cur_buf != end)) {
-        const u8 c = *cur_buf;
-        svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c));
-        cur_state = svtbl(cur_state, succ_mask);
-        const u8 tmp = svlastb(lane_pred_64, cur_state);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK,
-                     tmp & SHENG64_STATE_FLAG_MASK);
-
-        if (unlikely(ACCEPT_FUNC64(tmp))) {
-            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK);
-            u64a match_offset = base_offset + (cur_buf - buf) + 1;
-            DEBUG_PRINTF("Match @ %llu\n", match_offset);
-            if (STOP_AT_MATCH) {
-                DEBUG_PRINTF("Stopping at match @ %lli\n",
-                             (u64a)(cur_buf - start));
-                *state = tmp;
-                *scan_end = cur_buf;
-                return MO_MATCHES_PENDING;
-            }
-            if (single) {
-                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                    MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
-                }
-            } else {
-                if (fireReports64(s, cb, ctxt, tmp, match_offset,
-                                  cached_accept_state, cached_accept_id,
-                                  0) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
-                }
-            }
-        }
-        cur_buf++;
-    }
-    *state = svlastb(lane_pred_64, cur_state);
-    *scan_end = cur_buf;
-    return MO_CONTINUE_MATCHING;
-}
-#endif
-
 #if defined(HAVE_AVX512VBMI)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h
index 10ad4ea0..e5d3468f 100644
--- a/src/nfa/sheng_impl4.h
+++ b/src/nfa/sheng_impl4.h
@@ -283,434 +283,6 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
-#if defined(HAVE_SVE)
-static really_inline
-char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
-                  const struct sheng32 *s,
-                  u8 *const cached_accept_state,
-                  ReportID *const cached_accept_id,
-                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
-                  const u8 *end, const u8 **scan_end) {
-    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
-                 *state & SHENG32_STATE_MASK);
-    const u8 *cur_buf = start;
-    const u8 *min_accel_dist = start;
-    base_offset++;
-    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
-
-    if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) {
-        DEBUG_PRINTF("Accel state reached @ 0\n");
-        const union AccelAux *aaux =
-            get_accel32(s, *state & SHENG32_STATE_MASK);
-        const u8 *new_offset = run_accel(aaux, cur_buf, end);
-        if (new_offset < cur_buf + BAD_ACCEL_DIST) {
-            min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
-        } else {
-            min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
-        }
-        DEBUG_PRINTF("Next accel chance: %llu\n",
-                     (u64a)(min_accel_dist - start));
-        DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf);
-        cur_buf = new_offset;
-        DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start));
-    }
-    if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) {
-        DEBUG_PRINTF("Dead on arrival\n");
-        *scan_end = end;
-        return MO_CONTINUE_MATCHING;
-    }
-
-    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
-    svuint8_t cur_state = svld1(lane_pred_32, state);
-    const m512 *masks = s->succ_masks;
-
-    while (likely(end - cur_buf >= 4)) {
-        const u8 *b1 = cur_buf;
-        const u8 *b2 = cur_buf + 1;
-        const u8 *b3 = cur_buf + 2;
-        const u8 *b4 = cur_buf + 3;
-        const u8 c1 = *b1;
-        const u8 c2 = *b2;
-        const u8 c3 = *b3;
-        const u8 c4 = *b4;
-        svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1));
-        cur_state = svtbl(cur_state, succ_mask1);
-        const u8 a1 = svlastb(lane_pred_32, cur_state);
-
-        svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2));
-        cur_state = svtbl(cur_state, succ_mask2);
-        const u8 a2 = svlastb(lane_pred_32, cur_state);
-
-        svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3));
-        cur_state = svtbl(cur_state, succ_mask3);
-        const u8 a3 = svlastb(lane_pred_32, cur_state);
-
-        svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4));
-        cur_state = svtbl(cur_state, succ_mask4);
-        const u8 a4 = svlastb(lane_pred_32, cur_state);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK,
-                     a1 & SHENG32_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK,
-                     a2 & SHENG32_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK,
-                     a3 & SHENG32_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK,
-                     a4 & SHENG32_STATE_FLAG_MASK);
-
-        if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) {
-            if (ACCEPT_FUNC32(a1)) {
-                u64a match_offset = base_offset + b1 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a1 & SHENG32_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b1 - start));
-                    *scan_end = b1;
-                    *state = a1;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports32(s, cb, ctxt, a1, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC32(a2)) {
-                u64a match_offset = base_offset + b2 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a2 & SHENG32_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b2 - start));
-                    *scan_end = b2;
-                    *state = a2;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports32(s, cb, ctxt, a2, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC32(a3)) {
-                u64a match_offset = base_offset + b3 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a3 & SHENG32_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b3 - start));
-                    *scan_end = b3;
-                    *state = a3;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports32(s, cb, ctxt, a3, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC32(a4)) {
-                u64a match_offset = base_offset + b4 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a4 & SHENG32_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b4 - start));
-                    *scan_end = b4;
-                    *state = a4;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports32(s, cb, ctxt, a4, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (INNER_DEAD_FUNC32(a4)) {
-                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
-                *scan_end = end;
-                *state = a4;
-                return MO_CONTINUE_MATCHING;
-            }
-            if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) {
-                DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
-                const union AccelAux *aaux =
-                    get_accel32(s, a4 & SHENG32_STATE_MASK);
-                const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
-                if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
-                    min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
-                } else {
-                    min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
-                }
-                DEBUG_PRINTF("Next accel chance: %llu\n",
-                             (u64a)(min_accel_dist - start));
-                DEBUG_PRINTF("Accel scanned %llu bytes\n",
-                             (u64a)(new_offset - cur_buf - 4));
-                cur_buf = new_offset;
-                DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
-                continue;
-            }
-        }
-        if (OUTER_DEAD_FUNC32(a4)) {
-            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
-            *scan_end = end;
-            *state = a4;
-            return MO_CONTINUE_MATCHING;
-        };
-        if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) {
-            DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
-            const union AccelAux *aaux =
-                get_accel32(s, a4 & SHENG32_STATE_MASK);
-            const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
-            if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
-                min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
-            } else {
-                min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
-            }
-            DEBUG_PRINTF("Next accel chance: %llu\n",
-                         (u64a)(min_accel_dist - start));
-            DEBUG_PRINTF("Accel scanned %llu bytes\n",
-                         (u64a)(new_offset - cur_buf - 4));
-            cur_buf = new_offset;
-            DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
-            continue;
-        };
-        cur_buf += 4;
-    }
-    *state = svlastb(lane_pred_32, cur_state);
-    *scan_end = cur_buf;
-    return MO_CONTINUE_MATCHING;
-}
-
-#if !defined(NO_SHENG64_IMPL)
-static really_inline
-char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
-                  const struct sheng64 *s,
-                  u8 *const cached_accept_state,
-                  ReportID *const cached_accept_id,
-                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
-                  const u8 *end, const u8 **scan_end) {
-    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
-                 *state & SHENG64_STATE_MASK);
-    const u8 *cur_buf = start;
-    base_offset++;
-    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
-
-    if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) {
-        DEBUG_PRINTF("Dead on arrival\n");
-        *scan_end = end;
-        return MO_CONTINUE_MATCHING;
-    }
-
-    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
-    svuint8_t cur_state = svld1(lane_pred_64, state);
-    const m512 *masks = s->succ_masks;
-
-    while (likely(end - cur_buf >= 4)) {
-        const u8 *b1 = cur_buf;
-        const u8 *b2 = cur_buf + 1;
-        const u8 *b3 = cur_buf + 2;
-        const u8 *b4 = cur_buf + 3;
-        const u8 c1 = *b1;
-        const u8 c2 = *b2;
-        const u8 c3 = *b3;
-        const u8 c4 = *b4;
-
-        svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1));
-        cur_state = svtbl(cur_state, succ_mask1);
-        const u8 a1 = svlastb(lane_pred_64, cur_state);
-
-        svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2));
-        cur_state = svtbl(cur_state, succ_mask2);
-        const u8 a2 = svlastb(lane_pred_64, cur_state);
-
-        svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3));
-        cur_state = svtbl(cur_state, succ_mask3);
-        const u8 a3 = svlastb(lane_pred_64, cur_state);
-
-        svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4));
-        cur_state = svtbl(cur_state, succ_mask4);
-        const u8 a4 = svlastb(lane_pred_64, cur_state);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK,
-                     a1 & SHENG64_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK,
-                     a2 & SHENG64_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK,
-                     a3 & SHENG64_STATE_FLAG_MASK);
-
-        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
-        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK,
-                     a4 & SHENG64_STATE_FLAG_MASK);
-
-        if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) {
-            if (ACCEPT_FUNC64(a1)) {
-                u64a match_offset = base_offset + b1 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a1 & SHENG64_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b1 - start));
-                    *scan_end = b1;
-                    *state = a1;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports64(s, cb, ctxt, a1, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC64(a2)) {
-                u64a match_offset = base_offset + b2 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a2 & SHENG64_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b2 - start));
-                    *scan_end = b2;
-                    *state = a2;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports64(s, cb, ctxt, a2, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC64(a3)) {
-                u64a match_offset = base_offset + b3 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a3 & SHENG64_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b3 - start));
-                    *scan_end = b3;
-                    *state = a3;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports64(s, cb, ctxt, a3, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (ACCEPT_FUNC64(a4)) {
-                u64a match_offset = base_offset + b4 - buf;
-                DEBUG_PRINTF("Accept state %u reached\n",
-                             a4 & SHENG64_STATE_MASK);
-                DEBUG_PRINTF("Match @ %llu\n", match_offset);
-                if (STOP_AT_MATCH) {
-                    DEBUG_PRINTF("Stopping at match @ %lli\n",
-                                 (s64a)(b4 - start));
-                    *scan_end = b4;
-                    *state = a4;
-                    return MO_MATCHES_PENDING;
-                }
-                if (single) {
-                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
-                        MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                } else {
-                    if (fireReports64(s, cb, ctxt, a4, match_offset,
-                                      cached_accept_state, cached_accept_id,
-                                      0) == MO_HALT_MATCHING) {
-                        return MO_HALT_MATCHING;
-                    }
-                }
-            }
-            if (INNER_DEAD_FUNC64(a4)) {
-                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
-                *scan_end = end;
-                *state = a4;
-                return MO_CONTINUE_MATCHING;
-            }
-        }
-        if (OUTER_DEAD_FUNC64(a4)) {
-            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
-            *scan_end = end;
-            *state = a4;
-            return MO_CONTINUE_MATCHING;
-        }
-        cur_buf += 4;
-    }
-    *state = svlastb(lane_pred_64, cur_state);
-    *scan_end = cur_buf;
-    return MO_CONTINUE_MATCHING;
-}
-#endif
-#endif
-
 #if defined(HAVE_AVX512VBMI)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index 0f93e139..055e1971 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -730,17 +730,10 @@ bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
-#ifdef HAVE_SVE
-    if (svcntb()<32) {
-        DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n");
-        return nullptr;
-    }
-#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
-#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);
@@ -769,17 +762,10 @@ bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
-#ifdef HAVE_SVE
-    if (svcntb()<64) {
-        DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n");
-        return nullptr;
-    }
-#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
-#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);

From 50a62a17ffbfdbf4a9a30010bec61ebc271d95f8 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Mon, 1 Apr 2024 16:05:13 +0300
Subject: [PATCH 09/56] benchmarks: change color output to CSV output

---
 benchmarks/benchmarks.cpp | 168 ++++++++++++++++++++++----------------
 1 file changed, 97 insertions(+), 71 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index 91cab3f8..c6e453ef 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -26,32 +26,30 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <iostream>
 #include <chrono>
+#include <cstdlib>
 #include <cstring>
 #include <ctime>
-#include <cstdlib>
-#include <memory>
 #include <functional>
+#include <iostream>
+#include <memory>
 
 #include "benchmarks.hpp"
 
-#define MAX_LOOPS    1000000000
-#define MAX_MATCHES  5
-#define N            8
+#define MAX_LOOPS 1000000000
+#define MAX_MATCHES 5
+#define N 8
 
 struct hlmMatchEntry {
     size_t to;
     u32 id;
-    hlmMatchEntry(size_t end, u32 identifier) :
-            to(end), id(identifier) {}
+    hlmMatchEntry(size_t end, u32 identifier) : to(end), id(identifier) {}
 };
 
 std::vector<hlmMatchEntry> ctxt;
 
-static
-hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
-                              UNUSED struct hs_scratch *scratch) {
+static hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
+                                     UNUSED struct hs_scratch *scratch) {
     DEBUG_PRINTF("match @%zu = %u\n", to, id);
 
     ctxt.push_back(hlmMatchEntry(to, id));
@@ -59,10 +57,12 @@ hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
     return HWLM_CONTINUE_MATCHING;
 }
 
-template<typename InitFunc, typename BenchFunc>
-static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse, MicroBenchmark &bench, InitFunc &&init, BenchFunc &&func) {
+template <typename InitFunc, typename BenchFunc>
+static void run_benchmarks(int size, int loops, int max_matches,
+                           bool is_reverse, MicroBenchmark &bench,
+                           InitFunc &&init, BenchFunc &&func) {
     init(bench);
-    double total_sec = 0.0;            
+    double total_sec = 0.0;
     u64a total_size = 0;
     double bw = 0.0;
     double avg_bw = 0.0;
@@ -70,29 +70,31 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
     double avg_time = 0.0;
     if (max_matches) {
         int pos = 0;
-        for(int j = 0; j < max_matches - 1; j++) {
+        for (int j = 0; j < max_matches - 1; j++) {
             bench.buf[pos] = 'b';
-            pos = (j+1) *size / max_matches ;
+            pos = (j + 1) * size / max_matches;
             bench.buf[pos] = 'a';
             u64a actual_size = 0;
             auto start = std::chrono::steady_clock::now();
-            for(int i = 0; i < loops; i++) { 
+            for (int i = 0; i < loops; i++) {
                 const u8 *res = func(bench);
-		if (is_reverse)
-		   actual_size += bench.buf.data() + size - res;
-		else
-                   actual_size += res - bench.buf.data();
+                if (is_reverse)
+                    actual_size += bench.buf.data() + size - res;
+                else
+                    actual_size += res - bench.buf.data();
             }
             auto end = std::chrono::steady_clock::now();
-            double dt = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+            double dt = std::chrono::duration_cast<std::chrono::microseconds>(
+                            end - start)
+                            .count();
             total_sec += dt;
             /*convert microseconds to seconds*/
             /*calculate bandwidth*/
-            bw  = (actual_size / dt) * 1000000.0 / 1048576.0;
-	    /*std::cout << "act_size = " << act_size << std::endl;
-	    std::cout << "dt = " << dt << std::endl;
-	    std::cout << "bw = " << bw << std::endl;*/
-	    avg_bw += bw;
+            bw = (actual_size / dt) * 1000000.0 / 1048576.0;
+            /*std::cout << "act_size = " << act_size << std::endl;
+            std::cout << "dt = " << dt << std::endl;
+            std::cout << "bw = " << bw << std::endl;*/
+            avg_bw += bw;
             /*convert to MB/s*/
             max_bw = std::max(bw, max_bw);
             /*calculate average time*/
@@ -100,18 +102,28 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
         }
         avg_time /= max_matches;
         avg_bw /= max_matches;
-	total_sec /= 1000000.0;
+        total_sec /= 1000000.0;
         /*convert average time to us*/
-        printf(KMAG "%s: %u matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
-               KBLU "average time per call =" RST " %.3f μs," KBLU " max bandwidth = " RST " %.3f MB/s," KBLU " average bandwidth =" RST " %.3f MB/s \n",
-               bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
+        /* Keeping the color output
+        printf(KMAG "%s: %u matches, %u * %u iterations," KBLU
+                    " total elapsed time =" RST " %.3f s, " KBLU
+                    "average time per call =" RST " %.3f μs," KBLU
+                    " max bandwidth = " RST " %.3f MB/s," KBLU
+                    " average bandwidth =" RST " %.3f MB/s \n",
+               bench.label, max_matches, size, loops, total_sec, avg_time,
+               max_bw, avg_bw);
+        */
+        printf("%s,%u,%u,%u,%.3f,%.3f,%.3f,%.3f\n", bench.label, max_matches,
+               size, loops, total_sec, avg_time, max_bw, avg_bw);
     } else {
         auto start = std::chrono::steady_clock::now();
         for (int i = 0; i < loops; i++) {
             const u8 *res = func(bench);
         }
         auto end = std::chrono::steady_clock::now();
-        total_sec += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        total_sec +=
+            std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+                .count();
         /*calculate transferred size*/
         total_size = size * loops;
         /*calculate average time*/
@@ -122,117 +134,131 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
         max_bw = total_size / total_sec;
         /*convert to MB/s*/
         max_bw /= 1048576.0;
-        printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
-               KBLU "average time per call =" RST " %.3f μs ," KBLU " bandwidth = " RST " %.3f MB/s \n",
-               bench.label, size ,loops, total_sec, avg_time, max_bw );
+        /*Keeping the color output
+        printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed
+        time =" RST " %.3f s, " KBLU "average time per call =" RST " %.3f μs ,"
+        KBLU " bandwidth = " RST " %.3f MB/s \n", bench.label, size ,loops,
+        total_sec, avg_time, max_bw );
+        */
+        printf("%s,0,%u,%u,%.3f,%.3f,%.3f,0\n", bench.label, size, loops,
+               total_sec, avg_time, max_bw);
     }
 }
 
-int main(){
+int main() {
     int matches[] = {0, MAX_MATCHES};
     std::vector<size_t> sizes;
-    for (size_t i = 0; i < N; i++) sizes.push_back(16000 << i*2);
-    const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa"; 
-  
+    for (size_t i = 0; i < N; i++)
+        sizes.push_back(16000 << i * 2);
+    const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
+    printf("Bench Label, max_matches, size,loops, total_sec, avg_time, "
+           "max_bw, avg_bw\n");
     for (int m = 0; m < 2; m++) {
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Shufti", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return shuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return shuftiExec(b.lo, b.hi, b.buf.data(),
+                                      b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Reverse Shufti", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return rshuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return rshuftiExec(b.lo, b.hi, b.buf.data(),
+                                       b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Truffle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return truffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return truffleExec(b.lo, b.hi, b.buf.data(),
+                                       b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Reverse Truffle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return rtruffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return rtruffleExec(b.lo, b.hi, b.buf.data(),
+                                        b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Vermicelli", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return vermicelliExec('a', 'b', b.buf.data(),
+                                          b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                 [&](MicroBenchmark &b) {
                     b.chars.set('a');
                     ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
                     memset(b.buf.data(), 'b', b.size);
                 },
                 [&](MicroBenchmark &b) {
-                    return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                    return rvermicelliExec('a', 'b', b.buf.data(),
+                                           b.buf.data() + b.size);
+                });
         }
 
         for (size_t i = 0; i < std::size(sizes); i++) {
-            //we imitate the noodle unit tests
+            // we imitate the noodle unit tests
             std::string str;
             const size_t char_len = 5;
             str.resize(char_len + 1);
-            for (size_t j=0; j < char_len; j++) {
-                srand (time(NULL));
-                int key = rand() % + 36 ;
+            for (size_t j = 0; j < char_len; j++) {
+                srand(time(NULL));
+                int key = rand() % +36;
                 str[char_len] = charset[key];
                 str[char_len + 1] = '\0';
             }
 
             MicroBenchmark bench("Noodle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                 [&](MicroBenchmark &b) {
                     ctxt.clear();
                     memset(b.buf.data(), 'a', b.size);
@@ -242,10 +268,10 @@ int main(){
                     assert(b.nt != nullptr);
                 },
                 [&](MicroBenchmark &b) {
-                    noodExec(b.nt.get(), b.buf.data(), b.size, 0, hlmSimpleCallback, &b.scratch);
+                    noodExec(b.nt.get(), b.buf.data(), b.size, 0,
+                             hlmSimpleCallback, &b.scratch);
                     return b.buf.data() + b.size;
-                }
-           );
+                });
         }
     }
 

From b5a29155e4d4dee44f94ec44622bf8431e676ce4 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 2 Apr 2024 11:28:00 +0300
Subject: [PATCH 10/56] removed color output code

---
 benchmarks/benchmarks.cpp | 15 -------------
 benchmarks/benchmarks.hpp | 46 +++++++++++++++------------------------
 2 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index c6e453ef..14cccc10 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -104,15 +104,6 @@ static void run_benchmarks(int size, int loops, int max_matches,
         avg_bw /= max_matches;
         total_sec /= 1000000.0;
         /*convert average time to us*/
-        /* Keeping the color output
-        printf(KMAG "%s: %u matches, %u * %u iterations," KBLU
-                    " total elapsed time =" RST " %.3f s, " KBLU
-                    "average time per call =" RST " %.3f μs," KBLU
-                    " max bandwidth = " RST " %.3f MB/s," KBLU
-                    " average bandwidth =" RST " %.3f MB/s \n",
-               bench.label, max_matches, size, loops, total_sec, avg_time,
-               max_bw, avg_bw);
-        */
         printf("%s,%u,%u,%u,%.3f,%.3f,%.3f,%.3f\n", bench.label, max_matches,
                size, loops, total_sec, avg_time, max_bw, avg_bw);
     } else {
@@ -134,12 +125,6 @@ static void run_benchmarks(int size, int loops, int max_matches,
         max_bw = total_size / total_sec;
         /*convert to MB/s*/
         max_bw /= 1048576.0;
-        /*Keeping the color output
-        printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed
-        time =" RST " %.3f s, " KBLU "average time per call =" RST " %.3f μs ,"
-        KBLU " bandwidth = " RST " %.3f MB/s \n", bench.label, size ,loops,
-        total_sec, avg_time, max_bw );
-        */
         printf("%s,0,%u,%u,%.3f,%.3f,%.3f,0\n", bench.label, size, loops,
                total_sec, avg_time, max_bw);
     }
diff --git a/benchmarks/benchmarks.hpp b/benchmarks/benchmarks.hpp
index 974d2234..13f66fa5 100644
--- a/benchmarks/benchmarks.hpp
+++ b/benchmarks/benchmarks.hpp
@@ -26,44 +26,32 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "hwlm/hwlm_literal.h"
+#include "hwlm/noodle_build.h"
+#include "hwlm/noodle_engine.h"
+#include "hwlm/noodle_internal.h"
 #include "nfa/shufti.h"
 #include "nfa/shufticompile.h"
 #include "nfa/truffle.h"
 #include "nfa/trufflecompile.h"
 #include "nfa/vermicelli.hpp"
-#include "hwlm/noodle_build.h"
-#include "hwlm/noodle_engine.h"
-#include "hwlm/noodle_internal.h"
-#include "hwlm/hwlm_literal.h"
-#include "util/bytecode_ptr.h"
 #include "scratch.h"
+#include "util/bytecode_ptr.h"
 
-/*define colour control characters*/
-#define RST  "\x1B[0m"
-#define KRED  "\x1B[31m"
-#define KGRN  "\x1B[32m"
-#define KYEL  "\x1B[33m"
-#define KBLU  "\x1B[34m"
-#define KMAG  "\x1B[35m"
-#define KCYN  "\x1B[36m"
-#define KWHT  "\x1B[37m"
-
-class MicroBenchmark
-{
+class MicroBenchmark {
 public:
-  char const *label;
-  size_t size;
+    char const *label;
+    size_t size;
 
-  // Shufti/Truffle
-  m128 lo, hi;
-  ue2::CharReach chars;
-  std::vector<u8> buf;
+    // Shufti/Truffle
+    m128 lo, hi;
+    ue2::CharReach chars;
+    std::vector<u8> buf;
 
-  // Noodle
-  struct hs_scratch scratch;
-  ue2::bytecode_ptr<noodTable> nt;
+    // Noodle
+    struct hs_scratch scratch;
+    ue2::bytecode_ptr<noodTable> nt;
 
-  MicroBenchmark(char const *label_, size_t size_)
-  :label(label_), size(size_), buf(size_) {
-  };
+    MicroBenchmark(char const *label_, size_t size_)
+        : label(label_), size(size_), buf(size_){};
 };

From 62a275e5764efeb8202af18e80174f210b9a3993 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 2 Apr 2024 13:32:51 +0300
Subject: [PATCH 11/56] change first column name csv

---
 benchmarks/benchmarks.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index 14cccc10..fd7aed47 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -136,7 +136,7 @@ int main() {
     for (size_t i = 0; i < N; i++)
         sizes.push_back(16000 << i * 2);
     const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
-    printf("Bench Label, max_matches, size,loops, total_sec, avg_time, "
+    printf("Matcher, max_matches, size,loops, total_sec, avg_time, "
            "max_bw, avg_bw\n");
     for (int m = 0; m < 2; m++) {
         for (size_t i = 0; i < std::size(sizes); i++) {

From 3670e52c873e5631871030e3559111ea0d3529a3 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 2 Apr 2024 14:56:27 +0300
Subject: [PATCH 12/56] output tabulated and csv

---
 benchmarks/benchmarks.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index fd7aed47..b6106ed4 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -104,8 +104,9 @@ static void run_benchmarks(int size, int loops, int max_matches,
         avg_bw /= max_matches;
         total_sec /= 1000000.0;
         /*convert average time to us*/
-        printf("%s,%u,%u,%u,%.3f,%.3f,%.3f,%.3f\n", bench.label, max_matches,
-               size, loops, total_sec, avg_time, max_bw, avg_bw);
+        printf("%-18s, %-12u, %-10u, %-6u, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
+               bench.label, max_matches, size, loops, total_sec, avg_time,
+               max_bw, avg_bw);
     } else {
         auto start = std::chrono::steady_clock::now();
         for (int i = 0; i < loops; i++) {
@@ -125,8 +126,8 @@ static void run_benchmarks(int size, int loops, int max_matches,
         max_bw = total_size / total_sec;
         /*convert to MB/s*/
         max_bw /= 1048576.0;
-        printf("%s,0,%u,%u,%.3f,%.3f,%.3f,0\n", bench.label, size, loops,
-               total_sec, avg_time, max_bw);
+        printf("%-18s, %-12s, %-10u, %-6u, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
+               bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
     }
 }
 
@@ -136,8 +137,9 @@ int main() {
     for (size_t i = 0; i < N; i++)
         sizes.push_back(16000 << i * 2);
     const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
-    printf("Matcher, max_matches, size,loops, total_sec, avg_time, "
-           "max_bw, avg_bw\n");
+    printf("%-18s, %-12s, %-10s, %-6s, %-10s, %-9s, %-8s, %-7s\n", "Matcher",
+           "max_matches", "size", "loops", "total_sec", "avg_time", "max_bw",
+           "avg_bw");
     for (int m = 0; m < 2; m++) {
         for (size_t i = 0; i < std::size(sizes); i++) {
             MicroBenchmark bench("Shufti", sizes[i]);

From 3b37add4d87f223cc40e04d331e96beeb98f2d30 Mon Sep 17 00:00:00 2001
From: "G.E." <gregory.economou@vectorcamp.gr>
Date: Wed, 17 Apr 2024 11:33:00 +0300
Subject: [PATCH 13/56] the rpath hack is only needed on arm

---
 cmake/osdetection.cmake | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/cmake/osdetection.cmake b/cmake/osdetection.cmake
index 3369447a..8bfbd3bd 100644
--- a/cmake/osdetection.cmake
+++ b/cmake/osdetection.cmake
@@ -4,12 +4,14 @@ endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
 
 if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
-    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-    #FIXME: find a nicer and more general way of doing this
-    if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12")
-        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
-    elseif(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
-        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
+    if(ARCH_AARCH64)
+        set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+        #FIXME: find a nicer and more general way of doing this
+        if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12")
+            set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
+        elseif(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
+            set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
+        endif()
     endif()
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 

From f2db0cdf01560cc8a6531989a23a3e1e4ead522a Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 17 Apr 2024 13:33:48 +0300
Subject: [PATCH 14/56] gcc-14 compilation fix Closes:#245

---
 src/util/supervector/arch/x86/impl.cpp | 1735 ++++++++++++++----------
 1 file changed, 987 insertions(+), 748 deletions(-)

diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index b8a75c95..e0e9d966 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -35,170 +35,155 @@
 
 #include "ue2common.h"
 #include "util/arch.h"
-#include "util/unaligned.h"
 #include "util/supervector/supervector.hpp"
+#include "util/unaligned.h"
 
 // 128-bit SSE implementation
-#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && (defined(HAVE_AVX2) || defined(HAVE_AVX512))) && defined(HAVE_SIMD_128_BITS)
+#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) &&                       \
+      (defined(HAVE_AVX2) || defined(HAVE_AVX512))) &&                         \
+    defined(HAVE_SIMD_128_BITS)
 
-template<>
-really_inline SuperVector<16>::SuperVector(SuperVector const &other)
-{
+template <>
+really_inline SuperVector<16>::SuperVector(SuperVector const &other) {
     u.v128[0] = other.u.v128[0];
 }
 
-template<>
-really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
-{
+template <>
+really_inline SuperVector<16>::SuperVector(typename base_type::type const v) {
     u.v128[0] = v;
 };
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(int8_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(int8_t const other) {
     u.v128[0] = _mm_set1_epi8(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(uint8_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(uint8_t const other) {
     u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(int16_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(int16_t const other) {
     u.v128[0] = _mm_set1_epi16(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(uint16_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(uint16_t const other) {
     u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(int32_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(int32_t const other) {
     u.v128[0] = _mm_set1_epi32(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(uint32_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(uint32_t const other) {
     u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(int64_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(int64_t const other) {
     u.v128[0] = _mm_set1_epi64x(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<16>::SuperVector(uint64_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<16>::SuperVector(uint64_t const other) {
     u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
 }
 
 // Constants
-template<>
-really_inline SuperVector<16> SuperVector<16>::Ones()
-{
+template <> really_inline SuperVector<16> SuperVector<16>::Ones() {
     return {_mm_set1_epi8(0xFF)};
 }
 
-template<>
-really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
-{
+template <> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) {
     return {_mm_set1_epi8(0)};
 }
 
 // Methods
 
 template <>
-really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
-{
+really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) {
     u.v128[0] = other.u.v128[0];
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator&(SuperVector<16> const &b) const {
     return {_mm_and_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator|(SuperVector<16> const &b) const {
     return {_mm_or_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator^(SuperVector<16> const &b) const {
     return {_mm_xor_si128(u.v128[0], b.u.v128[0])};
 }
 
-template <>
-really_inline SuperVector<16> SuperVector<16>::operator!() const
-{
+template <> really_inline SuperVector<16> SuperVector<16>::operator!() const {
     return {_mm_xor_si128(u.v128[0], u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::opandnot(SuperVector<16> const &b) const {
     return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator==(SuperVector<16> const &b) const {
     return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator!=(SuperVector<16> const &b) const {
     return !(*this == b);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator>(SuperVector<16> const &b) const {
     return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator<(SuperVector<16> const &b) const {
     return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator>=(SuperVector<16> const &b) const {
     return !(*this < b);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator<=(SuperVector<16> const &b) const {
     return !(*this > b);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::eq(SuperVector<16> const &b) const {
     return (*this == b);
 }
 
@@ -232,37 +217,32 @@ SuperVector<16>::iteration_mask(
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const {
     return {_mm_slli_epi16(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const {
     return {_mm_slli_epi32(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const {
     return {_mm_slli_epi64(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const {
     return {_mm_slli_si128(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const {
     return vshl_128_imm<N>();
 }
 
@@ -274,37 +254,32 @@ really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const {
     return {_mm_srli_epi16(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const {
     return {_mm_srli_epi32(u.v128[0], N)};
 }
-  
+
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const {
     return {_mm_srli_epi64(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const {
     return {_mm_srli_si128(u.v128[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const {
     return vshr_128_imm<N>();
 }
 
@@ -322,156 +297,196 @@ template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
+// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N)
+// const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; });
-//     if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
+//     {_mm_slli_epi8(v->u.v128[0], i)}; }); if (N == 16) return Zeroes();
 // }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshl_16(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi16(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_slli_epi16(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshl_32(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi32(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_slli_epi32(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshl_64(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi64(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_slli_epi64(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_slli_si128(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const {
     return vshl_128(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
+// const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
-//     if (N == 16) result = Zeroes();
-//     return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
+//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
+//     Zeroes(); return result;
 // }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshr_16(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi16(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_srli_epi16(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshr_32(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi32(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_srli_epi32(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshr_64(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi64(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_srli_epi64(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
     }
 #endif
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; });
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm_srli_si128(v->u.v128[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
-{
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const {
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator>>(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
@@ -481,8 +496,8 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
-{
+really_inline SuperVector<16>
+SuperVector<16>::operator<<(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
@@ -491,45 +506,45 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
     return vshl_128(N);
 }
 
-template<>
-really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
-{
-    if (N == 0) return Ones();
-    else return Ones().vshr_128(N);
-}
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
-{
-    if (N == 0) return Ones();
-    else return Ones().vshr_128(N);
+template <>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) {
+    if (N == 0)
+        return Ones();
+    else
+        return Ones().vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
-{
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) {
+    if (N == 0)
+        return Ones();
+    else
+        return Ones().vshr_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) {
     return _mm_loadu_si128((const m128 *)ptr);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
-{
+really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return _mm_load_si128((const m128 *)ptr);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
-{
-    SuperVector mask = Ones_vshr(16 -len);
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr,
+                                                           uint8_t const len) {
+    SuperVector mask = Ones_vshr(16 - len);
     SuperVector v = _mm_loadu_si128((const m128 *)ptr);
     return mask & v;
 }
 
-template<>
-really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
-{
+template <>
+really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other,
+                                                      int8_t offset) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
@@ -539,224 +554,239 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
         }
     }
 #endif
-    switch(offset) {
-    case 0: return other; break;
-    case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break;
-    case 2: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)}; break;
-    case 3: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)}; break;
-    case 4: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)}; break;
-    case 5: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)}; break;
-    case 6: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)}; break;
-    case 7: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)}; break;
-    case 8: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)}; break;
-    case 9: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)}; break;
-    case 10: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)}; break;
-    case 11: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)}; break;
-    case 12: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)}; break;
-    case 13: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)}; break;
-    case 14: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)}; break;
-    case 15: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)}; break;
-    default: break;
+    switch (offset) {
+    case 0:
+        return other;
+        break;
+    case 1:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)};
+        break;
+    case 2:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)};
+        break;
+    case 3:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)};
+        break;
+    case 4:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)};
+        break;
+    case 5:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)};
+        break;
+    case 6:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)};
+        break;
+    case 7:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)};
+        break;
+    case 8:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)};
+        break;
+    case 9:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)};
+        break;
+    case 10:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)};
+        break;
+    case 11:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)};
+        break;
+    case 12:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)};
+        break;
+    case 13:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)};
+        break;
+    case 14:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)};
+        break;
+    case 15:
+        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)};
+        break;
+    default:
+        break;
     }
     return *this;
 }
 
-template<>
-template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
-{
+template <>
+template <>
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b) {
     return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])};
 }
 
-template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
-{
-    SuperVector mask = Ones_vshr(16 -len);
+template <>
+really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b,
+                                                            uint8_t const len) {
+    SuperVector mask = Ones_vshr(16 - len);
     return mask & pshufb(b);
 }
 
 #endif // !defined(FAT_RUNTIME) && !defined(HAVE_AVX2)
 
 // 256-bit AVX2 implementation
-#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && defined(HAVE_AVX512)) && defined(HAVE_AVX2)
+#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) &&                       \
+      defined(HAVE_AVX512)) &&                                                 \
+    defined(HAVE_AVX2)
 
-template<>
-really_inline SuperVector<32>::SuperVector(SuperVector const &other)
-{
+template <>
+really_inline SuperVector<32>::SuperVector(SuperVector const &other) {
     u.v256[0] = other.u.v256[0];
 }
 
-template<>
-really_inline SuperVector<32>::SuperVector(typename base_type::type const v)
-{
+template <>
+really_inline SuperVector<32>::SuperVector(typename base_type::type const v) {
     u.v256[0] = v;
 };
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(m128 const v)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(m128 const v) {
     u.v256[0] = _mm256_broadcastsi128_si256(v);
 };
 
-template<>
-really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi)
-{
+template <>
+really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi) {
     u.v128[0] = lo;
     u.v128[1] = hi;
 };
 
-template<>
-really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector<16> const hi)
-{
+template <>
+really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo,
+                                           SuperVector<16> const hi) {
     u.v128[0] = lo.u.v128[0];
     u.v128[1] = hi.u.v128[0];
 };
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(int8_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(int8_t const other) {
     u.v256[0] = _mm256_set1_epi8(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(uint8_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(uint8_t const other) {
     u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(int16_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(int16_t const other) {
     u.v256[0] = _mm256_set1_epi16(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(uint16_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(uint16_t const other) {
     u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(int32_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(int32_t const other) {
     u.v256[0] = _mm256_set1_epi32(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(uint32_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(uint32_t const other) {
     u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other));
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(int64_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(int64_t const other) {
     u.v256[0] = _mm256_set1_epi64x(other);
 }
 
-template<>
-template<>
-really_inline SuperVector<32>::SuperVector(uint64_t const other)
-{
+template <>
+template <>
+really_inline SuperVector<32>::SuperVector(uint64_t const other) {
     u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other));
 }
 
 // Constants
-template<>
-really_inline SuperVector<32> SuperVector<32>::Ones(void)
-{
+template <> really_inline SuperVector<32> SuperVector<32>::Ones(void) {
     return {_mm256_set1_epi8(0xFF)};
 }
 
-template<>
-really_inline SuperVector<32> SuperVector<32>::Zeroes(void)
-{
+template <> really_inline SuperVector<32> SuperVector<32>::Zeroes(void) {
     return {_mm256_set1_epi8(0)};
 }
 
 template <>
-really_inline void SuperVector<32>::operator=(SuperVector<32> const &other)
-{
+really_inline void SuperVector<32>::operator=(SuperVector<32> const &other) {
     u.v256[0] = other.u.v256[0];
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator&(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator&(SuperVector<32> const &b) const {
     return {_mm256_and_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator|(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator|(SuperVector<32> const &b) const {
     return {_mm256_or_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator^(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator^(SuperVector<32> const &b) const {
     return {_mm256_xor_si256(u.v256[0], b.u.v256[0])};
 }
 
-template <>
-really_inline SuperVector<32> SuperVector<32>::operator!() const
-{
+template <> really_inline SuperVector<32> SuperVector<32>::operator!() const {
     return {_mm256_xor_si256(u.v256[0], u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::opandnot(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::opandnot(SuperVector<32> const &b) const {
     return {_mm256_andnot_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator==(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator==(SuperVector<32> const &b) const {
     return {_mm256_cmpeq_epi8(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator!=(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator!=(SuperVector<32> const &b) const {
     return !(*this == b);
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator>(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator>(SuperVector<32> const &b) const {
     return {_mm256_cmpgt_epi8(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator<(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator<(SuperVector<32> const &b) const {
     return (b > *this);
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator>=(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator>=(SuperVector<32> const &b) const {
     return !(*this < b);
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator<=(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator<=(SuperVector<32> const &b) const {
     return !(*this > b);
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::eq(SuperVector<32> const &b) const {
     return (*this == b);
 }
 
@@ -790,51 +820,56 @@ SuperVector<32>::iteration_mask(
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const {
     return {_mm256_slli_epi16(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const {
     return {_mm256_slli_epi32(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const {
     return {_mm256_slli_epi64(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const {
     return {_mm256_slli_si256(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const
-{
-    if (N == 0) return *this;
-    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
-    if (N == 32) return Zeroes();
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const {
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0))};
+    if (N == 32)
+        return Zeroes();
     if (N < 16) {
-        return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
+        return {_mm256_alignr_epi8(
+            u.v256[0],
+            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                      _MM_SHUFFLE(0, 0, 2, 0)),
+            16 - N)};
     } else {
-        return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
+        return {_mm256_slli_si256(
+            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                      _MM_SHUFFLE(0, 0, 2, 0)),
+            N - 16)};
     }
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_imm() const {
     return vshl_256_imm<N>();
 }
 
@@ -846,51 +881,56 @@ really_inline SuperVector<32> SuperVector<32>::vshl_imm() const
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const {
     return {_mm256_srli_epi16(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const {
     return {_mm256_srli_epi32(u.v256[0], N)};
 }
-  
+
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const {
     return {_mm256_srli_epi64(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const {
     return {_mm256_srli_si256(u.v256[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const
-{
-    if (N == 0) return *this;
-    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
-    if (N == 32) return Zeroes();
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const {
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1))};
+    if (N == 32)
+        return Zeroes();
     if (N < 16) {
-        return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
+        return {_mm256_alignr_epi8(
+            u.v256[0],
+            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                      _MM_SHUFFLE(0, 0, 2, 0)),
+            16 - N)};
     } else {
-        return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
+        return {_mm256_srli_si256(
+            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                      _MM_SHUFFLE(2, 0, 0, 1)),
+            N - 16)};
     }
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_imm() const {
     return vshr_256_imm<N>();
 }
 
@@ -910,161 +950,233 @@ template SuperVector<32> SuperVector<32>::vshr_imm<1>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
+// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N)
+// const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm256_slli_epi8(v->u.v256[0], i)}; });
-//     if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
+//     {_mm256_slli_epi8(v->u.v256[0], i)}; }); if (N == 16) return Zeroes();
 // }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_16 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
+really_inline SuperVector<32> SuperVector<32>::vshl_16(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi16(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_32 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi32(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_64 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi64(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_si256(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) {
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
-    });
-    Unroller<17, 32>::iterator([&,v=this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
+        if (N == n)
+            result = {_mm256_slli_epi16(v->u.v256[0], n)};
     });
     return result;
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const
-{
+really_inline SuperVector<32> SuperVector<32>::vshl_32(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_slli_epi32(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_64(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_slli_epi64(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_slli_si256(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0))};
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_alignr_epi8(
+                u.v256[0],
+                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0)),
+                16 - n)};
+        ;
+    });
+    Unroller<17, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_slli_si256(
+                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0)),
+                n - 16)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const {
     return vshl_256(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
+// const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
-//     if (N == 16) result = Zeroes();
-//     return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
+//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
+//     Zeroes(); return result;
 // }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_16 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
+really_inline SuperVector<32> SuperVector<32>::vshr_16(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi16(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_32 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi32(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_64 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi64(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_si256(v->u.v256[0], n)}; });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
-    if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) {
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_alignr_epi8(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), v->u.v256[0], n)};
-    });
-    Unroller<17, 32>::iterator([&,v=this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_srli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), n - 16)};
+        if (N == n)
+            result = {_mm256_srli_epi16(v->u.v256[0], n)};
     });
     return result;
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const
-{
+really_inline SuperVector<32> SuperVector<32>::vshr_32(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_srli_epi32(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_64(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_srli_epi64(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_srli_si256(v->u.v256[0], n)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1))};
+    if (N == 32)
+        return Zeroes();
+    SuperVector result;
+    Unroller<1, 16>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_alignr_epi8(
+                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1)),
+                v->u.v256[0], n)};
+    });
+    Unroller<17, 32>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm256_srli_si256(
+                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1)),
+                n - 16)};
+    });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const {
     return vshr_256(N);
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator>>(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
-        // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
+        // As found here:
+        // https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
         if (N < 16) {
-            return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)};
+            return {_mm256_alignr_epi8(
+                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1)),
+                u.v256[0], N)};
         } else if (N == 16) {
-            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
+            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                              _MM_SHUFFLE(2, 0, 0, 1))};
         } else {
-            return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
+            return {_mm256_srli_si256(
+                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(2, 0, 0, 1)),
+                N - 16)};
         }
     }
 #endif
@@ -1072,37 +1184,46 @@ really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
-{
+really_inline SuperVector<32>
+SuperVector<32>::operator<<(uint8_t const N) const {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
-        // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
+        // As found here:
+        // https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
         if (N < 16) {
-            return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
+            return {_mm256_alignr_epi8(
+                u.v256[0],
+                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0)),
+                16 - N)};
         } else if (N == 16) {
-            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
+            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                              _MM_SHUFFLE(0, 0, 2, 0))};
         } else {
-            return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
+            return {_mm256_slli_si256(
+                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
+                                          _MM_SHUFFLE(0, 0, 2, 0)),
+                N - 16)};
         }
     }
 #endif
     return vshl_256(N);
 }
 
-template<>
-really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N)
-{
-    if (N == 0) return Ones();
+template <>
+really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) {
+    if (N == 0)
+        return Ones();
     if (N >= 16)
         return {SuperVector<16>::Ones_vshr(N - 16), SuperVector<16>::Zeroes()};
     else
         return {SuperVector<16>::Ones(), SuperVector<16>::Ones_vshr(N)};
 }
 
-template<>
-really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N)
-{
-    if (N == 0) return Ones();
+template <>
+really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N) {
+    if (N == 0)
+        return Ones();
     if (N >= 16)
         return {SuperVector<16>::Zeroes(), SuperVector<16>::Ones_vshl(N - 16)};
     else
@@ -1110,30 +1231,29 @@ really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N)
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr)
-{
+really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr) {
     return {_mm256_loadu_si256((const m256 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
-{
+really_inline SuperVector<32> SuperVector<32>::load(void const *ptr) {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return {_mm256_load_si256((const m256 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len)
-{
+really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr,
+                                                           uint8_t const len) {
 #ifdef HAVE_AVX512
     u32 mask = (~0ULL) >> (32 - len);
-    SuperVector<32> v = _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
+    SuperVector<32> v =
+        _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
     v.print8("v");
     return v;
 #else
     DEBUG_PRINTF("len = %d", len);
-    SuperVector<32> mask = Ones_vshr(32 -len);
+    SuperVector<32> mask = Ones_vshr(32 - len);
     mask.print8("mask");
     (Ones() >> (32 - len)).print8("mask");
     SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
@@ -1142,10 +1262,11 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint
 #endif
 }
 
-template<>
-really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
-{
-#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && (__GNUC__ == 13))
+template <>
+really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other,
+                                                      int8_t offset) {
+#if defined(HAVE__BUILTIN_CONSTANT_P) &&                                       \
+    !(defined(__GNUC__) && ((__GNUC__ == 13) || (__GNUC__ == 14)))
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
             return *this;
@@ -1154,262 +1275,359 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in
         }
     }
 #endif
-    // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
-    switch (offset){ 
-    case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break;
-    case 1 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1)); break;
-    case 2 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2)); break;
-    case 3 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3)); break;
-    case 4 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4)); break;
-    case 5 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5)); break;
-    case 6 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6)); break;
-    case 7 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7)); break;
-    case 8 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8)); break;
-    case 9 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9)); break;
-    case 10 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10)); break;
-    case 11 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11)); break;
-    case 12 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12)); break;
-    case 13 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13)); break;
-    case 14 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14)); break;
-    case 15 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15)); break;
-    case 16 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0)); break;
-    case 17 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1)); break;
-    case 18 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2)); break;
-    case 19 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3)); break;
-    case 20 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4)); break;
-    case 21 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5)); break;
-    case 22 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6)); break;
-    case 23 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7)); break;
-    case 24 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8)); break;
-    case 25 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9)); break;
-    case 26 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10)); break;
-    case 27 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11)); break;
-    case 28 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12)); break;
-    case 29 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13)); break;
-    case 30 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14)); break;
-    case 31 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15)); break;  
-    default: break;
+    // As found here:
+    // https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
+    switch (offset) {
+    case 0:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0));
+        break;
+    case 1:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1));
+        break;
+    case 2:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2));
+        break;
+    case 3:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3));
+        break;
+    case 4:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4));
+        break;
+    case 5:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5));
+        break;
+    case 6:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6));
+        break;
+    case 7:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7));
+        break;
+    case 8:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8));
+        break;
+    case 9:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9));
+        break;
+    case 10:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10));
+        break;
+    case 11:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11));
+        break;
+    case 12:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12));
+        break;
+    case 13:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13));
+        break;
+    case 14:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14));
+        break;
+    case 15:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15),
+            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15));
+        break;
+    case 16:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0));
+        break;
+    case 17:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1));
+        break;
+    case 18:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2));
+        break;
+    case 19:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3));
+        break;
+    case 20:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4));
+        break;
+    case 21:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5));
+        break;
+    case 22:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6));
+        break;
+    case 23:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7));
+        break;
+    case 24:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8));
+        break;
+    case 25:
+        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9),
+                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9));
+        break;
+    case 26:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 10),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10));
+        break;
+    case 27:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 11),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11));
+        break;
+    case 28:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 12),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12));
+        break;
+    case 29:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 13),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13));
+        break;
+    case 30:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 14),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14));
+        break;
+    case 31:
+        return _mm256_set_m128i(
+            _mm_alignr_epi8(u.v128[1], u.v128[0], 15),
+            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15));
+        break;
+    default:
+        break;
     }
     return *this;
 }
 
-template<>
-template<>
-really_inline SuperVector<32> SuperVector<32>::pshufb<true>(SuperVector<32> b)
-{
+template <>
+template <>
+really_inline SuperVector<32> SuperVector<32>::pshufb<true>(SuperVector<32> b) {
     return {_mm256_shuffle_epi8(u.v256[0], b.u.v256[0])};
 }
 
-template<>
-really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b, uint8_t const len)
-{
-    SuperVector<32> mask = Ones_vshr(32 -len);
+template <>
+really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b,
+                                                            uint8_t const len) {
+    SuperVector<32> mask = Ones_vshr(32 - len);
     return mask & pshufb(b);
 }
 
 #endif // HAVE_AVX2
 
-
 // 512-bit AVX512 implementation
 #if defined(HAVE_AVX512)
 
-template<>
-really_inline SuperVector<64>::SuperVector(SuperVector const &o)
-{
+template <> really_inline SuperVector<64>::SuperVector(SuperVector const &o) {
     u.v512[0] = o.u.v512[0];
 }
 
-template<>
-really_inline SuperVector<64>::SuperVector(typename base_type::type const v)
-{
+template <>
+really_inline SuperVector<64>::SuperVector(typename base_type::type const v) {
     u.v512[0] = v;
 };
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(m256 const v)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(m256 const v) {
     u.v512[0] = _mm512_broadcast_i64x4(v);
 };
 
-template<>
-really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi)
-{
+template <>
+really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi) {
     u.v256[0] = lo;
     u.v256[1] = hi;
 };
 
-template<>
-really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo, SuperVector<32> const hi)
-{
+template <>
+really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo,
+                                           SuperVector<32> const hi) {
     u.v256[0] = lo.u.v256[0];
     u.v256[1] = hi.u.v256[0];
 };
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(m128 const v)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(m128 const v) {
     u.v512[0] = _mm512_broadcast_i32x4(v);
 };
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(int8_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(int8_t const o) {
     u.v512[0] = _mm512_set1_epi8(o);
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(uint8_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(uint8_t const o) {
     u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o));
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(int16_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(int16_t const o) {
     u.v512[0] = _mm512_set1_epi16(o);
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(uint16_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(uint16_t const o) {
     u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o));
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(int32_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(int32_t const o) {
     u.v512[0] = _mm512_set1_epi32(o);
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(uint32_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(uint32_t const o) {
     u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o));
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(int64_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(int64_t const o) {
     u.v512[0] = _mm512_set1_epi64(o);
 }
 
-template<>
-template<>
-really_inline SuperVector<64>::SuperVector(uint64_t const o)
-{
+template <>
+template <>
+really_inline SuperVector<64>::SuperVector(uint64_t const o) {
     u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o));
 }
 
 // Constants
-template<>
-really_inline SuperVector<64> SuperVector<64>::Ones(void)
-{
+template <> really_inline SuperVector<64> SuperVector<64>::Ones(void) {
     return {_mm512_set1_epi8(0xFF)};
 }
 
-template<>
-really_inline SuperVector<64> SuperVector<64>::Zeroes(void)
-{
+template <> really_inline SuperVector<64> SuperVector<64>::Zeroes(void) {
     return {_mm512_set1_epi8(0)};
 }
 
 // Methods
 template <>
-really_inline void SuperVector<64>::operator=(SuperVector<64> const &o)
-{
+really_inline void SuperVector<64>::operator=(SuperVector<64> const &o) {
     u.v512[0] = o.u.v512[0];
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator&(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator&(SuperVector<64> const &b) const {
     return {_mm512_and_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator|(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator|(SuperVector<64> const &b) const {
     return {_mm512_or_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator^(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator^(SuperVector<64> const &b) const {
     return {_mm512_xor_si512(u.v512[0], b.u.v512[0])};
 }
 
-template <>
-really_inline SuperVector<64> SuperVector<64>::operator!() const
-{
+template <> really_inline SuperVector<64> SuperVector<64>::operator!() const {
     return {_mm512_xor_si512(u.v512[0], u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::opandnot(SuperVector<64> const &b) const {
     return {_mm512_andnot_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator==(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator!=(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator>(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator<(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator>=(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator<=(SuperVector<64> const &b) const {
     SuperVector<64>::comparemask_type mask =
         _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::eq(SuperVector<64> const &b) const {
     return (*this == b);
 }
 
@@ -1445,51 +1663,44 @@ SuperVector<64>::iteration_mask(
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const {
     return {_mm512_slli_epi16(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const {
     return {_mm512_slli_epi32(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const {
     return {_mm512_slli_epi64(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const {
     return {_mm512_bslli_epi128(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const {
     return {};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const {
     return {};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_imm() const {
     return vshl_512_imm<N>();
 }
 
@@ -1501,51 +1712,44 @@ really_inline SuperVector<64> SuperVector<64>::vshl_imm() const
 // }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const {
     return {_mm512_srli_epi16(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const {
     return {_mm512_srli_epi32(u.v512[0], N)};
 }
-  
+
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const {
     return {_mm512_srli_epi64(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const {
     return {_mm512_bsrli_epi128(u.v512[0], N)};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const {
     return {};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const {
     return {};
 }
 
 template <>
-template<uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_imm() const
-{
+template <uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_imm() const {
     return vshr_512_imm<N>();
 }
 
@@ -1563,150 +1767,186 @@ template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<64> SuperVector<64>::vshl_8  (uint8_t const N) const
+// really_inline SuperVector<64> SuperVector<64>::vshl_8  (uint8_t const N)
+// const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; });
-//     if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
+//     {_mm_slli_epi8(v->u.v128[0], i)}; }); if (N == 16) return Zeroes();
 // }
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_16 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_16(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi16(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_slli_epi16(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_32 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_32(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi32(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_slli_epi32(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_64 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_64(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi64(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_slli_epi64(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bslli_epi128(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_bslli_epi128(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const {
     return vshl_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const {
     return vshl_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const {
     return vshl_512(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
+// const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
-//     if (N == 16) result = Zeroes();
-//     return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
+//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
+//     Zeroes(); return result;
 // }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_16 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_16(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi16(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_srli_epi16(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_32 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_32(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi32(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_srli_epi32(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_64 (uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_64(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 16)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi64(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_srli_epi64(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const
-{
-    if (N == 0) return *this;
-    if (N == 64) return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const {
+    if (N == 0)
+        return *this;
+    if (N == 64)
+        return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bsrli_epi128(v->u.v512[0], n)}; });
+    Unroller<1, 64>::iterator([&, v = this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n)
+            result = {_mm512_bsrli_epi128(v->u.v512[0], n)};
+    });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_256(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshr_256(uint8_t const N) const {
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const {
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const
-{
+really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const {
     return vshr_512(N);
 }
 
-template<>
-really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N)
-{
-    if (N == 0) return Ones();
+template <>
+really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N) {
+    if (N == 0)
+        return Ones();
     if (N >= 32)
         return {SuperVector<32>::Ones_vshr(N - 32), SuperVector<32>::Zeroes()};
     else
         return {SuperVector<32>::Ones(), SuperVector<32>::Ones_vshr(N)};
 }
 
-template<>
-really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N)
-{
-    if (N == 0) return Ones();
+template <>
+really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N) {
+    if (N == 0)
+        return Ones();
     if (N >= 32)
         return {SuperVector<32>::Zeroes(), SuperVector<32>::Ones_vshl(N - 32)};
     else
@@ -1714,8 +1954,8 @@ really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N)
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator>>(uint8_t const N) const {
     if (N == 0) {
         return *this;
     } else if (N < 32) {
@@ -1737,8 +1977,8 @@ really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
-{
+really_inline SuperVector<64>
+SuperVector<64>::operator<<(uint8_t const N) const {
     if (N == 0) {
         return *this;
     } else if (N < 32) {
@@ -1760,48 +2000,47 @@ really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr)
-{
+really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr) {
     return {_mm512_loadu_si512((const m512 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::load(void const *ptr)
-{
+really_inline SuperVector<64> SuperVector<64>::load(void const *ptr) {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return {_mm512_load_si512((const m512 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint8_t const len)
-{
+really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr,
+                                                           uint8_t const len) {
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask = %016llx\n", mask);
-    SuperVector<64> v = _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
+    SuperVector<64> v =
+        _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
     v.print8("v");
     return v;
 }
 
-template<>
-template<>
-really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b)
-{
+template <>
+template <>
+really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b) {
     return {_mm512_shuffle_epi8(u.v512[0], b.u.v512[0])};
 }
 
-template<>
-really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, uint8_t const len)
-{
+template <>
+really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b,
+                                                            uint8_t const len) {
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask = %016llx\n", mask);
     return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])};
 }
 
-template<>
-really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
-{
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+template <>
+really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l,
+                                                      int8_t offset) {
+#if defined(HAVE__BUILTIN_CONSTANT_P) &&                                       \
+    !(defined(__GNUC__) && (__GNUC__ == 14))
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
             return *this;
@@ -1810,21 +2049,21 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t
         }
     }
 #endif
-    if(offset == 0) {
+    if (offset == 0) {
         return *this;
-    } else if (offset < 32){
+    } else if (offset < 32) {
         SuperVector<32> lo256 = u.v256[0];
         SuperVector<32> hi256 = u.v256[1];
         SuperVector<32> o_lo256 = l.u.v256[0];
-        SuperVector<32> carry1 = hi256.alignr(lo256,offset);
-        SuperVector<32> carry2 = o_lo256.alignr(hi256,offset);
+        SuperVector<32> carry1 = hi256.alignr(lo256, offset);
+        SuperVector<32> carry2 = o_lo256.alignr(hi256, offset);
         return SuperVector(carry1, carry2);
-    } else if (offset <= 64){
+    } else if (offset <= 64) {
         SuperVector<32> hi256 = u.v256[1];
         SuperVector<32> o_lo256 = l.u.v256[0];
         SuperVector<32> o_hi256 = l.u.v256[1];
         SuperVector<32> carry1 = o_lo256.alignr(hi256, offset - 32);
-        SuperVector<32> carry2 = o_hi256.alignr(o_lo256,offset -32);
+        SuperVector<32> carry2 = o_hi256.alignr(o_lo256, offset - 32);
         return SuperVector(carry1, carry2);
     } else {
         return *this;

From 51ac3a2287a47f59396df6ee34b04c99ac1ced3e Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 17 Apr 2024 13:55:42 +0300
Subject: [PATCH 15/56] clang-format revert

---
 src/util/supervector/arch/x86/impl.cpp | 1735 ++++++++++--------------
 1 file changed, 748 insertions(+), 987 deletions(-)

diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index e0e9d966..d83f6792 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -35,155 +35,170 @@
 
 #include "ue2common.h"
 #include "util/arch.h"
-#include "util/supervector/supervector.hpp"
 #include "util/unaligned.h"
+#include "util/supervector/supervector.hpp"
 
 // 128-bit SSE implementation
-#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) &&                       \
-      (defined(HAVE_AVX2) || defined(HAVE_AVX512))) &&                         \
-    defined(HAVE_SIMD_128_BITS)
+#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && (defined(HAVE_AVX2) || defined(HAVE_AVX512))) && defined(HAVE_SIMD_128_BITS)
 
-template <>
-really_inline SuperVector<16>::SuperVector(SuperVector const &other) {
+template<>
+really_inline SuperVector<16>::SuperVector(SuperVector const &other)
+{
     u.v128[0] = other.u.v128[0];
 }
 
-template <>
-really_inline SuperVector<16>::SuperVector(typename base_type::type const v) {
+template<>
+really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
+{
     u.v128[0] = v;
 };
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(int8_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int8_t const other)
+{
     u.v128[0] = _mm_set1_epi8(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(uint8_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
+{
     u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(int16_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int16_t const other)
+{
     u.v128[0] = _mm_set1_epi16(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(uint16_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
+{
     u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(int32_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int32_t const other)
+{
     u.v128[0] = _mm_set1_epi32(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(uint32_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
+{
     u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(int64_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int64_t const other)
+{
     u.v128[0] = _mm_set1_epi64x(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<16>::SuperVector(uint64_t const other) {
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
+{
     u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
 }
 
 // Constants
-template <> really_inline SuperVector<16> SuperVector<16>::Ones() {
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones()
+{
     return {_mm_set1_epi8(0xFF)};
 }
 
-template <> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) {
+template<>
+really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
+{
     return {_mm_set1_epi8(0)};
 }
 
 // Methods
 
 template <>
-really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) {
+really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
+{
     u.v128[0] = other.u.v128[0];
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator&(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
+{
     return {_mm_and_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator|(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
+{
     return {_mm_or_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator^(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
+{
     return {_mm_xor_si128(u.v128[0], b.u.v128[0])};
 }
 
-template <> really_inline SuperVector<16> SuperVector<16>::operator!() const {
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!() const
+{
     return {_mm_xor_si128(u.v128[0], u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::opandnot(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
+{
     return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator==(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
+{
     return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator!=(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{
     return !(*this == b);
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator>(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{
     return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator<(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{
     return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])};
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator>=(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{
     return !(*this < b);
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator<=(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{
     return !(*this > b);
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::eq(SuperVector<16> const &b) const {
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{
     return (*this == b);
 }
 
@@ -217,32 +232,37 @@ SuperVector<16>::iteration_mask(
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{
     return {_mm_slli_epi16(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{
     return {_mm_slli_epi32(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{
     return {_mm_slli_epi64(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{
     return {_mm_slli_si128(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshl_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{
     return vshl_128_imm<N>();
 }
 
@@ -254,32 +274,37 @@ really_inline SuperVector<16> SuperVector<16>::vshl_imm() const {
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{
     return {_mm_srli_epi16(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{
     return {_mm_srli_epi32(u.v128[0], N)};
 }
-
+  
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{
     return {_mm_srli_epi64(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{
     return {_mm_srli_si128(u.v128[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<16> SuperVector<16>::vshr_imm() const {
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{
     return vshr_128_imm<N>();
 }
 
@@ -297,196 +322,156 @@ template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N)
-// const
+// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
-//     {_mm_slli_epi8(v->u.v128[0], i)}; }); if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) return Zeroes();
 // }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_16(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi16(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_slli_epi16(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_32(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi32(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_slli_epi32(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_64(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_epi64(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_slli_epi64(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_slli_si128(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
+{
     return vshl_128(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
-// const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
-//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
-//     Zeroes(); return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) result = Zeroes();
+//     return result;
 // }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_16(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi16(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_srli_epi16(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_32(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi32(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_srli_epi32(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_64(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_epi64(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_srli_epi64(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
     }
 #endif
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm_srli_si128(v->u.v128[0], n)};
-    });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
+{
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator>>(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
@@ -496,8 +481,8 @@ SuperVector<16>::operator>>(uint8_t const N) const {
 }
 
 template <>
-really_inline SuperVector<16>
-SuperVector<16>::operator<<(uint8_t const N) const {
+really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
@@ -506,45 +491,45 @@ SuperVector<16>::operator<<(uint8_t const N) const {
     return vshl_128(N);
 }
 
-template <>
-really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) {
-    if (N == 0)
-        return Ones();
-    else
-        return Ones().vshr_128(N);
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{
+    if (N == 0) return Ones();
+    else return Ones().vshr_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{
+    if (N == 0) return Ones();
+    else return Ones().vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) {
-    if (N == 0)
-        return Ones();
-    else
-        return Ones().vshr_128(N);
-}
-
-template <>
-really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) {
+really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
+{
     return _mm_loadu_si128((const m128 *)ptr);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) {
+really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
+{
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return _mm_load_si128((const m128 *)ptr);
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr,
-                                                           uint8_t const len) {
-    SuperVector mask = Ones_vshr(16 - len);
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 -len);
     SuperVector v = _mm_loadu_si128((const m128 *)ptr);
     return mask & v;
 }
 
-template <>
-really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other,
-                                                      int8_t offset) {
+template<>
+really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
@@ -554,239 +539,224 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other,
         }
     }
 #endif
-    switch (offset) {
-    case 0:
-        return other;
-        break;
-    case 1:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)};
-        break;
-    case 2:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)};
-        break;
-    case 3:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)};
-        break;
-    case 4:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)};
-        break;
-    case 5:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)};
-        break;
-    case 6:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)};
-        break;
-    case 7:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)};
-        break;
-    case 8:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)};
-        break;
-    case 9:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)};
-        break;
-    case 10:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)};
-        break;
-    case 11:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)};
-        break;
-    case 12:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)};
-        break;
-    case 13:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)};
-        break;
-    case 14:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)};
-        break;
-    case 15:
-        return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)};
-        break;
-    default:
-        break;
+    switch(offset) {
+    case 0: return other; break;
+    case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break;
+    case 2: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)}; break;
+    case 3: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)}; break;
+    case 4: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)}; break;
+    case 5: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)}; break;
+    case 6: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)}; break;
+    case 7: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)}; break;
+    case 8: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)}; break;
+    case 9: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)}; break;
+    case 10: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)}; break;
+    case 11: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)}; break;
+    case 12: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)}; break;
+    case 13: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)}; break;
+    case 14: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)}; break;
+    case 15: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)}; break;
+    default: break;
     }
     return *this;
 }
 
-template <>
-template <>
-really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b) {
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
+{
     return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])};
 }
 
-template <>
-really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b,
-                                                            uint8_t const len) {
-    SuperVector mask = Ones_vshr(16 - len);
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 -len);
     return mask & pshufb(b);
 }
 
 #endif // !defined(FAT_RUNTIME) && !defined(HAVE_AVX2)
 
 // 256-bit AVX2 implementation
-#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) &&                       \
-      defined(HAVE_AVX512)) &&                                                 \
-    defined(HAVE_AVX2)
+#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && defined(HAVE_AVX512)) && defined(HAVE_AVX2)
 
-template <>
-really_inline SuperVector<32>::SuperVector(SuperVector const &other) {
+template<>
+really_inline SuperVector<32>::SuperVector(SuperVector const &other)
+{
     u.v256[0] = other.u.v256[0];
 }
 
-template <>
-really_inline SuperVector<32>::SuperVector(typename base_type::type const v) {
+template<>
+really_inline SuperVector<32>::SuperVector(typename base_type::type const v)
+{
     u.v256[0] = v;
 };
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(m128 const v) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(m128 const v)
+{
     u.v256[0] = _mm256_broadcastsi128_si256(v);
 };
 
-template <>
-really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi) {
+template<>
+really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi)
+{
     u.v128[0] = lo;
     u.v128[1] = hi;
 };
 
-template <>
-really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo,
-                                           SuperVector<16> const hi) {
+template<>
+really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector<16> const hi)
+{
     u.v128[0] = lo.u.v128[0];
     u.v128[1] = hi.u.v128[0];
 };
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(int8_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(int8_t const other)
+{
     u.v256[0] = _mm256_set1_epi8(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(uint8_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(uint8_t const other)
+{
     u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(int16_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(int16_t const other)
+{
     u.v256[0] = _mm256_set1_epi16(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(uint16_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(uint16_t const other)
+{
     u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(int32_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(int32_t const other)
+{
     u.v256[0] = _mm256_set1_epi32(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(uint32_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(uint32_t const other)
+{
     u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other));
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(int64_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(int64_t const other)
+{
     u.v256[0] = _mm256_set1_epi64x(other);
 }
 
-template <>
-template <>
-really_inline SuperVector<32>::SuperVector(uint64_t const other) {
+template<>
+template<>
+really_inline SuperVector<32>::SuperVector(uint64_t const other)
+{
     u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other));
 }
 
 // Constants
-template <> really_inline SuperVector<32> SuperVector<32>::Ones(void) {
+template<>
+really_inline SuperVector<32> SuperVector<32>::Ones(void)
+{
     return {_mm256_set1_epi8(0xFF)};
 }
 
-template <> really_inline SuperVector<32> SuperVector<32>::Zeroes(void) {
+template<>
+really_inline SuperVector<32> SuperVector<32>::Zeroes(void)
+{
     return {_mm256_set1_epi8(0)};
 }
 
 template <>
-really_inline void SuperVector<32>::operator=(SuperVector<32> const &other) {
+really_inline void SuperVector<32>::operator=(SuperVector<32> const &other)
+{
     u.v256[0] = other.u.v256[0];
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator&(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator&(SuperVector<32> const &b) const
+{
     return {_mm256_and_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator|(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator|(SuperVector<32> const &b) const
+{
     return {_mm256_or_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator^(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator^(SuperVector<32> const &b) const
+{
     return {_mm256_xor_si256(u.v256[0], b.u.v256[0])};
 }
 
-template <> really_inline SuperVector<32> SuperVector<32>::operator!() const {
+template <>
+really_inline SuperVector<32> SuperVector<32>::operator!() const
+{
     return {_mm256_xor_si256(u.v256[0], u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::opandnot(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::opandnot(SuperVector<32> const &b) const
+{
     return {_mm256_andnot_si256(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator==(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator==(SuperVector<32> const &b) const
+{
     return {_mm256_cmpeq_epi8(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator!=(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator!=(SuperVector<32> const &b) const
+{
     return !(*this == b);
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator>(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator>(SuperVector<32> const &b) const
+{
     return {_mm256_cmpgt_epi8(u.v256[0], b.u.v256[0])};
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator<(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator<(SuperVector<32> const &b) const
+{
     return (b > *this);
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator>=(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator>=(SuperVector<32> const &b) const
+{
     return !(*this < b);
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator<=(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::operator<=(SuperVector<32> const &b) const
+{
     return !(*this > b);
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::eq(SuperVector<32> const &b) const {
+really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const
+{
     return (*this == b);
 }
 
@@ -820,56 +790,51 @@ SuperVector<32>::iteration_mask(
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const
+{
     return {_mm256_slli_epi16(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const
+{
     return {_mm256_slli_epi32(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const
+{
     return {_mm256_slli_epi64(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const
+{
     return {_mm256_slli_si256(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const {
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0))};
-    if (N == 32)
-        return Zeroes();
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const
+{
+    if (N == 0) return *this;
+    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
+    if (N == 32) return Zeroes();
     if (N < 16) {
-        return {_mm256_alignr_epi8(
-            u.v256[0],
-            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                      _MM_SHUFFLE(0, 0, 2, 0)),
-            16 - N)};
+        return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
     } else {
-        return {_mm256_slli_si256(
-            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                      _MM_SHUFFLE(0, 0, 2, 0)),
-            N - 16)};
+        return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
     }
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshl_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshl_imm() const
+{
     return vshl_256_imm<N>();
 }
 
@@ -881,56 +846,51 @@ really_inline SuperVector<32> SuperVector<32>::vshl_imm() const {
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const
+{
     return {_mm256_srli_epi16(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const
+{
     return {_mm256_srli_epi32(u.v256[0], N)};
 }
-
+  
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const
+{
     return {_mm256_srli_epi64(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const
+{
     return {_mm256_srli_si256(u.v256[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const {
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1))};
-    if (N == 32)
-        return Zeroes();
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const
+{
+    if (N == 0) return *this;
+    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
+    if (N == 32) return Zeroes();
     if (N < 16) {
-        return {_mm256_alignr_epi8(
-            u.v256[0],
-            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                      _MM_SHUFFLE(0, 0, 2, 0)),
-            16 - N)};
+        return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
     } else {
-        return {_mm256_srli_si256(
-            _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                      _MM_SHUFFLE(2, 0, 0, 1)),
-            N - 16)};
+        return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
     }
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<32> SuperVector<32>::vshr_imm() const {
+template<uint8_t N>
+really_inline SuperVector<32> SuperVector<32>::vshr_imm() const
+{
     return vshr_256_imm<N>();
 }
 
@@ -950,233 +910,161 @@ template SuperVector<32> SuperVector<32>::vshr_imm<1>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N)
-// const
+// really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
-//     {_mm256_slli_epi8(v->u.v256[0], i)}; }); if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm256_slli_epi8(v->u.v256[0], i)}; });
+//     if (N == 16) return Zeroes();
 // }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_16(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
+really_inline SuperVector<32> SuperVector<32>::vshl_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi16(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi32(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi64(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_si256(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_slli_epi16(v->u.v256[0], n)};
+        if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
+    });
+    Unroller<17, 32>::iterator([&,v=this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
     });
     return result;
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_32(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_slli_epi32(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_64(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_slli_epi64(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_slli_si256(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0))};
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_alignr_epi8(
-                u.v256[0],
-                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0)),
-                16 - n)};
-        ;
-    });
-    Unroller<17, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_slli_si256(
-                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0)),
-                n - 16)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const {
+really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const
+{
     return vshl_256(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
-// const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
-//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
-//     Zeroes(); return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) result = Zeroes();
+//     return result;
 // }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_16(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
+really_inline SuperVector<32> SuperVector<32>::vshr_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi16(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi32(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi64(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_si256(v->u.v256[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
+    if (N == 32) return Zeroes();
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_srli_epi16(v->u.v256[0], n)};
+        if (N == n) result = {_mm256_alignr_epi8(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), v->u.v256[0], n)};
+    });
+    Unroller<17, 32>::iterator([&,v=this](auto const i) {
+        constexpr uint8_t n = i.value;
+        if (N == n) result = {_mm256_srli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), n - 16)};
     });
     return result;
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_32(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_srli_epi32(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_64(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_srli_epi64(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_srli_si256(v->u.v256[0], n)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1))};
-    if (N == 32)
-        return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_alignr_epi8(
-                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1)),
-                v->u.v256[0], n)};
-    });
-    Unroller<17, 32>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm256_srli_si256(
-                _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1)),
-                n - 16)};
-    });
-    return result;
-}
-
-template <>
-really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const {
+really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const
+{
     return vshr_256(N);
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator>>(uint8_t const N) const {
+really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
-        // As found here:
-        // https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
+        // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
         if (N < 16) {
-            return {_mm256_alignr_epi8(
-                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1)),
-                u.v256[0], N)};
+            return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)};
         } else if (N == 16) {
-            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                              _MM_SHUFFLE(2, 0, 0, 1))};
+            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
         } else {
-            return {_mm256_srli_si256(
-                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(2, 0, 0, 1)),
-                N - 16)};
+            return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
         }
     }
 #endif
@@ -1184,46 +1072,37 @@ SuperVector<32>::operator>>(uint8_t const N) const {
 }
 
 template <>
-really_inline SuperVector<32>
-SuperVector<32>::operator<<(uint8_t const N) const {
+really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
+{
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(N)) {
-        // As found here:
-        // https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
+        // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
         if (N < 16) {
-            return {_mm256_alignr_epi8(
-                u.v256[0],
-                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0)),
-                16 - N)};
+            return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
         } else if (N == 16) {
-            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                              _MM_SHUFFLE(0, 0, 2, 0))};
+            return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
         } else {
-            return {_mm256_slli_si256(
-                _mm256_permute2x128_si256(u.v256[0], u.v256[0],
-                                          _MM_SHUFFLE(0, 0, 2, 0)),
-                N - 16)};
+            return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
         }
     }
 #endif
     return vshl_256(N);
 }
 
-template <>
-really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) {
-    if (N == 0)
-        return Ones();
+template<>
+really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N)
+{
+    if (N == 0) return Ones();
     if (N >= 16)
         return {SuperVector<16>::Ones_vshr(N - 16), SuperVector<16>::Zeroes()};
     else
         return {SuperVector<16>::Ones(), SuperVector<16>::Ones_vshr(N)};
 }
 
-template <>
-really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N) {
-    if (N == 0)
-        return Ones();
+template<>
+really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N)
+{
+    if (N == 0) return Ones();
     if (N >= 16)
         return {SuperVector<16>::Zeroes(), SuperVector<16>::Ones_vshl(N - 16)};
     else
@@ -1231,29 +1110,30 @@ really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N) {
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr) {
+really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr)
+{
     return {_mm256_loadu_si256((const m256 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::load(void const *ptr) {
+really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
+{
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return {_mm256_load_si256((const m256 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr,
-                                                           uint8_t const len) {
+really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len)
+{
 #ifdef HAVE_AVX512
     u32 mask = (~0ULL) >> (32 - len);
-    SuperVector<32> v =
-        _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
+    SuperVector<32> v = _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
     v.print8("v");
     return v;
 #else
     DEBUG_PRINTF("len = %d", len);
-    SuperVector<32> mask = Ones_vshr(32 - len);
+    SuperVector<32> mask = Ones_vshr(32 -len);
     mask.print8("mask");
     (Ones() >> (32 - len)).print8("mask");
     SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
@@ -1262,11 +1142,10 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr,
 #endif
 }
 
-template <>
-really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other,
-                                                      int8_t offset) {
-#if defined(HAVE__BUILTIN_CONSTANT_P) &&                                       \
-    !(defined(__GNUC__) && ((__GNUC__ == 13) || (__GNUC__ == 14)))
+template<>
+really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
+{
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && ((__GNUC__ == 13) || (__GNUC__ == 14)))
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
             return *this;
@@ -1275,359 +1154,262 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other,
         }
     }
 #endif
-    // As found here:
-    // https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
-    switch (offset) {
-    case 0:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0));
-        break;
-    case 1:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1));
-        break;
-    case 2:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2));
-        break;
-    case 3:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3));
-        break;
-    case 4:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4));
-        break;
-    case 5:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5));
-        break;
-    case 6:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6));
-        break;
-    case 7:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7));
-        break;
-    case 8:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8));
-        break;
-    case 9:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9));
-        break;
-    case 10:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10));
-        break;
-    case 11:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11));
-        break;
-    case 12:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12));
-        break;
-    case 13:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13));
-        break;
-    case 14:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14));
-        break;
-    case 15:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15),
-            _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15));
-        break;
-    case 16:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0));
-        break;
-    case 17:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1));
-        break;
-    case 18:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2));
-        break;
-    case 19:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3));
-        break;
-    case 20:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4));
-        break;
-    case 21:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5));
-        break;
-    case 22:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6));
-        break;
-    case 23:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7));
-        break;
-    case 24:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8));
-        break;
-    case 25:
-        return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9),
-                                _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9));
-        break;
-    case 26:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 10),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10));
-        break;
-    case 27:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 11),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11));
-        break;
-    case 28:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 12),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12));
-        break;
-    case 29:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 13),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13));
-        break;
-    case 30:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 14),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14));
-        break;
-    case 31:
-        return _mm256_set_m128i(
-            _mm_alignr_epi8(u.v128[1], u.v128[0], 15),
-            _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15));
-        break;
-    default:
-        break;
+    // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
+    switch (offset){ 
+    case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break;
+    case 1 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1)); break;
+    case 2 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2)); break;
+    case 3 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3)); break;
+    case 4 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4)); break;
+    case 5 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5)); break;
+    case 6 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6)); break;
+    case 7 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7)); break;
+    case 8 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8)); break;
+    case 9 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9)); break;
+    case 10 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10)); break;
+    case 11 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11)); break;
+    case 12 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12)); break;
+    case 13 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13)); break;
+    case 14 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14)); break;
+    case 15 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15)); break;
+    case 16 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0)); break;
+    case 17 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1)); break;
+    case 18 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2)); break;
+    case 19 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3)); break;
+    case 20 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4)); break;
+    case 21 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5)); break;
+    case 22 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6)); break;
+    case 23 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7)); break;
+    case 24 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8)); break;
+    case 25 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9)); break;
+    case 26 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10)); break;
+    case 27 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11)); break;
+    case 28 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12)); break;
+    case 29 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13)); break;
+    case 30 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14)); break;
+    case 31 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15)); break;  
+    default: break;
     }
     return *this;
 }
 
-template <>
-template <>
-really_inline SuperVector<32> SuperVector<32>::pshufb<true>(SuperVector<32> b) {
+template<>
+template<>
+really_inline SuperVector<32> SuperVector<32>::pshufb<true>(SuperVector<32> b)
+{
     return {_mm256_shuffle_epi8(u.v256[0], b.u.v256[0])};
 }
 
-template <>
-really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b,
-                                                            uint8_t const len) {
-    SuperVector<32> mask = Ones_vshr(32 - len);
+template<>
+really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b, uint8_t const len)
+{
+    SuperVector<32> mask = Ones_vshr(32 -len);
     return mask & pshufb(b);
 }
 
 #endif // HAVE_AVX2
 
+
 // 512-bit AVX512 implementation
 #if defined(HAVE_AVX512)
 
-template <> really_inline SuperVector<64>::SuperVector(SuperVector const &o) {
+template<>
+really_inline SuperVector<64>::SuperVector(SuperVector const &o)
+{
     u.v512[0] = o.u.v512[0];
 }
 
-template <>
-really_inline SuperVector<64>::SuperVector(typename base_type::type const v) {
+template<>
+really_inline SuperVector<64>::SuperVector(typename base_type::type const v)
+{
     u.v512[0] = v;
 };
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(m256 const v) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(m256 const v)
+{
     u.v512[0] = _mm512_broadcast_i64x4(v);
 };
 
-template <>
-really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi) {
+template<>
+really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi)
+{
     u.v256[0] = lo;
     u.v256[1] = hi;
 };
 
-template <>
-really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo,
-                                           SuperVector<32> const hi) {
+template<>
+really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo, SuperVector<32> const hi)
+{
     u.v256[0] = lo.u.v256[0];
     u.v256[1] = hi.u.v256[0];
 };
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(m128 const v) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(m128 const v)
+{
     u.v512[0] = _mm512_broadcast_i32x4(v);
 };
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(int8_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(int8_t const o)
+{
     u.v512[0] = _mm512_set1_epi8(o);
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(uint8_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(uint8_t const o)
+{
     u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o));
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(int16_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(int16_t const o)
+{
     u.v512[0] = _mm512_set1_epi16(o);
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(uint16_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(uint16_t const o)
+{
     u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o));
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(int32_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(int32_t const o)
+{
     u.v512[0] = _mm512_set1_epi32(o);
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(uint32_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(uint32_t const o)
+{
     u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o));
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(int64_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(int64_t const o)
+{
     u.v512[0] = _mm512_set1_epi64(o);
 }
 
-template <>
-template <>
-really_inline SuperVector<64>::SuperVector(uint64_t const o) {
+template<>
+template<>
+really_inline SuperVector<64>::SuperVector(uint64_t const o)
+{
     u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o));
 }
 
 // Constants
-template <> really_inline SuperVector<64> SuperVector<64>::Ones(void) {
+template<>
+really_inline SuperVector<64> SuperVector<64>::Ones(void)
+{
     return {_mm512_set1_epi8(0xFF)};
 }
 
-template <> really_inline SuperVector<64> SuperVector<64>::Zeroes(void) {
+template<>
+really_inline SuperVector<64> SuperVector<64>::Zeroes(void)
+{
     return {_mm512_set1_epi8(0)};
 }
 
 // Methods
 template <>
-really_inline void SuperVector<64>::operator=(SuperVector<64> const &o) {
+really_inline void SuperVector<64>::operator=(SuperVector<64> const &o)
+{
     u.v512[0] = o.u.v512[0];
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator&(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator&(SuperVector<64> const &b) const
+{
     return {_mm512_and_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator|(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator|(SuperVector<64> const &b) const
+{
     return {_mm512_or_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator^(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator^(SuperVector<64> const &b) const
+{
     return {_mm512_xor_si512(u.v512[0], b.u.v512[0])};
 }
 
-template <> really_inline SuperVector<64> SuperVector<64>::operator!() const {
+template <>
+really_inline SuperVector<64> SuperVector<64>::operator!() const
+{
     return {_mm512_xor_si512(u.v512[0], u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::opandnot(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b) const
+{
     return {_mm512_andnot_si512(u.v512[0], b.u.v512[0])};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator==(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator!=(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator>(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator<(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator>=(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator<=(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
+{
     SuperVector<64>::comparemask_type mask =
         _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::eq(SuperVector<64> const &b) const {
+really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) const
+{
     return (*this == b);
 }
 
@@ -1663,44 +1445,51 @@ SuperVector<64>::iteration_mask(
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const
+{
     return {_mm512_slli_epi16(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const
+{
     return {_mm512_slli_epi32(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const
+{
     return {_mm512_slli_epi64(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const
+{
     return {_mm512_bslli_epi128(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const
+{
     return {};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const
+{
     return {};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshl_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshl_imm() const
+{
     return vshl_512_imm<N>();
 }
 
@@ -1712,44 +1501,51 @@ really_inline SuperVector<64> SuperVector<64>::vshl_imm() const {
 // }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const
+{
     return {_mm512_srli_epi16(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const
+{
     return {_mm512_srli_epi32(u.v512[0], N)};
 }
-
+  
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const
+{
     return {_mm512_srli_epi64(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const
+{
     return {_mm512_bsrli_epi128(u.v512[0], N)};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const
+{
     return {};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const
+{
     return {};
 }
 
 template <>
-template <uint8_t N>
-really_inline SuperVector<64> SuperVector<64>::vshr_imm() const {
+template<uint8_t N>
+really_inline SuperVector<64> SuperVector<64>::vshr_imm() const
+{
     return vshr_512_imm<N>();
 }
 
@@ -1767,186 +1563,150 @@ template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const;
 #endif
 
 // template <>
-// really_inline SuperVector<64> SuperVector<64>::vshl_8  (uint8_t const N)
-// const
+// really_inline SuperVector<64> SuperVector<64>::vshl_8  (uint8_t const N) const
 // {
-//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return
-//     {_mm_slli_epi8(v->u.v128[0], i)}; }); if (N == 16) return Zeroes();
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) return Zeroes();
 // }
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_16(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_slli_epi16(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi16(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_32(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_slli_epi32(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi32(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_64(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_slli_epi64(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi64(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_bslli_epi128(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bslli_epi128(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const
+{
     return vshl_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const
+{
     return vshl_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const
+{
     return vshl_512(N);
 }
 
 // template <>
-// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N)
-// const
+// really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 // {
 //     SuperVector<16> result;
-//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i)
-//     result = {_mm_srli_epi8(v->u.v128[0], i)}; }); if (N == 16) result =
-//     Zeroes(); return result;
+//     Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) result = Zeroes();
+//     return result;
 // }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_16(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_srli_epi16(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi16(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_32(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_srli_epi32(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi32(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_64(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 16)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes(); // shifting a 64-bit lane right by 64 leaves no bits; smaller N (incl. 16) must really shift
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_srli_epi64(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi64(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const {
-    if (N == 0)
-        return *this;
-    if (N == 64)
-        return Zeroes();
+really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 64>::iterator([&, v = this](auto const i) {
-        constexpr uint8_t n = i.value;
-        if (N == n)
-            result = {_mm512_bsrli_epi128(v->u.v512[0], n)};
-    });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bsrli_epi128(v->u.v512[0], n)}; });
     return result;
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_256(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshr_256(uint8_t const N) const
+{
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const
+{
     return vshr_128(N);
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const
+{
     return vshr_512(N);
 }
 
-template <>
-really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N) {
-    if (N == 0)
-        return Ones();
+template<>
+really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N)
+{
+    if (N == 0) return Ones();
     if (N >= 32)
         return {SuperVector<32>::Ones_vshr(N - 32), SuperVector<32>::Zeroes()};
     else
         return {SuperVector<32>::Ones(), SuperVector<32>::Ones_vshr(N)};
 }
 
-template <>
-really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N) {
-    if (N == 0)
-        return Ones();
+template<>
+really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N)
+{
+    if (N == 0) return Ones();
     if (N >= 32)
         return {SuperVector<32>::Zeroes(), SuperVector<32>::Ones_vshl(N - 32)};
     else
@@ -1954,8 +1714,8 @@ really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N) {
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator>>(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const
+{
     if (N == 0) {
         return *this;
     } else if (N < 32) {
@@ -1977,8 +1737,8 @@ SuperVector<64>::operator>>(uint8_t const N) const {
 }
 
 template <>
-really_inline SuperVector<64>
-SuperVector<64>::operator<<(uint8_t const N) const {
+really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
+{
     if (N == 0) {
         return *this;
     } else if (N < 32) {
@@ -2000,47 +1760,48 @@ SuperVector<64>::operator<<(uint8_t const N) const {
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr) {
+really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr)
+{
     return {_mm512_loadu_si512((const m512 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::load(void const *ptr) {
+really_inline SuperVector<64> SuperVector<64>::load(void const *ptr)
+{
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
     return {_mm512_load_si512((const m512 *)ptr)};
 }
 
 template <>
-really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr,
-                                                           uint8_t const len) {
+really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint8_t const len)
+{
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask = %016llx\n", mask);
-    SuperVector<64> v =
-        _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
+    SuperVector<64> v = _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
     v.print8("v");
     return v;
 }
 
-template <>
-template <>
-really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b) {
+template<>
+template<>
+really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b)
+{
     return {_mm512_shuffle_epi8(u.v512[0], b.u.v512[0])};
 }
 
-template <>
-really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b,
-                                                            uint8_t const len) {
+template<>
+really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, uint8_t const len)
+{
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask = %016llx\n", mask);
     return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])};
 }
 
-template <>
-really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l,
-                                                      int8_t offset) {
-#if defined(HAVE__BUILTIN_CONSTANT_P) &&                                       \
-    !(defined(__GNUC__) && (__GNUC__ == 14))
+template<>
+really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
+{
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && (__GNUC__ == 14))
     if (__builtin_constant_p(offset)) {
         if (offset == 16) {
             return *this;
@@ -2049,21 +1810,21 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l,
         }
     }
 #endif
-    if (offset == 0) {
+    if(offset == 0) {
         return *this;
-    } else if (offset < 32) {
+    } else if (offset < 32){
         SuperVector<32> lo256 = u.v256[0];
         SuperVector<32> hi256 = u.v256[1];
         SuperVector<32> o_lo256 = l.u.v256[0];
-        SuperVector<32> carry1 = hi256.alignr(lo256, offset);
-        SuperVector<32> carry2 = o_lo256.alignr(hi256, offset);
+        SuperVector<32> carry1 = hi256.alignr(lo256,offset);
+        SuperVector<32> carry2 = o_lo256.alignr(hi256,offset);
         return SuperVector(carry1, carry2);
-    } else if (offset <= 64) {
+    } else if (offset <= 64){
         SuperVector<32> hi256 = u.v256[1];
         SuperVector<32> o_lo256 = l.u.v256[0];
         SuperVector<32> o_hi256 = l.u.v256[1];
         SuperVector<32> carry1 = o_lo256.alignr(hi256, offset - 32);
-        SuperVector<32> carry2 = o_hi256.alignr(o_lo256, offset - 32);
+        SuperVector<32> carry2 = o_hi256.alignr(o_lo256,offset -32);
         return SuperVector(carry1, carry2);
     } else {
         return *this;

From 1e614dc86145a3163970efc9b9a11aa3eda88bee Mon Sep 17 00:00:00 2001
From: "G.E." <gregory.economou@vectorcamp.gr>
Date: Wed, 17 Apr 2024 15:40:52 +0300
Subject: [PATCH 16/56] enable the rpath hack on all gcc13, and on arm/gcc12

---
 cmake/osdetection.cmake | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cmake/osdetection.cmake b/cmake/osdetection.cmake
index 8bfbd3bd..2cef0b94 100644
--- a/cmake/osdetection.cmake
+++ b/cmake/osdetection.cmake
@@ -4,14 +4,12 @@ endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
 
 if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
-    if(ARCH_AARCH64)
-        set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-        #FIXME: find a nicer and more general way of doing this
-        if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12")
-            set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
-        elseif(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
-            set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
-        endif()
+    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    #FIXME: find a nicer and more general way of doing this
+    if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
+        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
+    elseif(ARCH_AARCH64 AND (CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12"))
+        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
     endif()
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 

From cdc0d47cde9e350e510cbe3b56b1cac767cefdad Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 17 Apr 2024 17:23:11 +0300
Subject: [PATCH 17/56] Update SIMDe

---
 simde | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simde b/simde
index aae22459..416091eb 160000
--- a/simde
+++ b/simde
@@ -1 +1 @@
-Subproject commit aae22459fa284e9fc2b7d4b8e4571afa0418125f
+Subproject commit 416091ebdb9e901b29d026633e73167d6353a0b0

From 50fdcaf35733f30ff045d2dac88909c65f612a00 Mon Sep 17 00:00:00 2001
From: "G.E." <gregory.economou@vectorcamp.gr>
Date: Wed, 17 Apr 2024 23:03:09 +0300
Subject: [PATCH 18/56] readme edit

---
 README.md | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2e68d2e6..d5f0892c 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,7 @@ export CXX="/usr/pkg/gcc12/bin/g++"
 ```
 
 In FreeBSD similarly, you might want to install a different compiler.
+If you want to use gcc, it is recommended to use gcc12.
 You will also, as in NetBSD, need to install cmake, sqlite, boost and ragel packages.
 Using the example of gcc12 from pkg:
 installing the desired compiler: 
@@ -175,12 +176,6 @@ export CXX="/usr/local/bin/g++12"
 
 Then continue with the build as below. 
 
-A note about running in FreeBSD: if you built a dynamically linked binary
-with an alternative compiler, the libraries specific to the compiler that
-built the binary will probably not be found and the base distro libraries
-in /lib will be found instead. Adjust LD_LIBRARY_PATH appropriately. For
-example, with gcc12 installed from pkg, one would want to use
-```export LD_LIBRARY_PATH=/usr/local/lib/gcc12/``` 
 
 ## Configure & build
 

From acbef47c74d842c60284e05e634dbdb6c27e9650 Mon Sep 17 00:00:00 2001
From: "G.E." <gregory.economou@vectorcamp.gr>
Date: Thu, 18 Apr 2024 16:16:06 +0300
Subject: [PATCH 19/56] tiny change to readme

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d5f0892c..483b2cad 100644
--- a/README.md
+++ b/README.md
@@ -165,7 +165,6 @@ the environment variables to point to this compiler:
 export CC="/usr/local/bin/gcc"
 export CXX="/usr/local/bin/g++"
 ```
-
 A further note in FreeBSD, on the PowerPC and ARM platforms, 
 the gcc12 package installs to a slightly different name, on FreeBSD/ppc, 
 gcc12 will be found using: 

From cfa8397e97562beb522c68494343422d120ab3bc Mon Sep 17 00:00:00 2001
From: Gregory Economou <groik@devel-freebsd14-aarch64-01.lan>
Date: Fri, 19 Apr 2024 12:32:00 +0300
Subject: [PATCH 20/56] static dispatch for fat runtimes. eliminates the need
 for ifunc.

---
 src/dispatcher.c | 249 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 209 insertions(+), 40 deletions(-)

diff --git a/src/dispatcher.c b/src/dispatcher.c
index a817e744..e213bbe6 100644
--- a/src/dispatcher.c
+++ b/src/dispatcher.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +31,39 @@
 #include "hs_common.h"
 #include "hs_runtime.h"
 #include "ue2common.h"
+
+/* Streamlining the dispatch to eliminate runtime checking/branching:
+ * What we want is for the first call to the function to run the resolve
+ * code and set the static resolved/dispatch pointer to point to the
+ * correct function. Subsequent calls to the function will go directly to
+ * the resolved ptr. The simplest way to accomplish this is to
+ * initially set the pointer to the resolve function.
+ * To accomplish this in a manner invisible to the user,
+ * we involve some rather ugly/confusing macros in here.
+ * There are four macros that assemble the code for each function
+ * we want to dispatch in this manner:
+ * CREATE_DISPATCH
+ * this generates the declarations for the candidate target functions,
+ * for the fat_dispatch function pointer, for the resolve_ function,
+ * points the function pointer to the resolve function, and contains
+ * most of the definition of the resolve function. The very end of the
+ * resolve function is completed by the next macro, because in the
+ * CREATE_DISPATCH macro we have the argument list with the arg declarations,
+ * which is needed to generate correct function signatures, but we
+ * can't generate from this, in a macro, a _call_ to one of those functions.
+ * CONNECT_ARGS_1
+ * this macro fills in the actual call at the end of the resolve function,
+ * with the correct arg list. hence the name connect args.
+ * CONNECT_DISPATCH_2
+ * this macro likewise gives us the beginning of the definition of the
+ * actual entry point function (the 'real name' that's called by the user),
+ * but again, in the pass-through call, it cannot invoke the target without
+ * the arg list, which is supplied by the final macro,
+ * CONNECT_ARGS_3
+ *
+ */
+
+
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/cpuid_inline.h"
 #include "util/join.h"
@@ -57,30 +91,38 @@
         return (RTYPE)HS_ARCH_ERROR;                                           \
     }                                                                          \
                                                                                \
-    /* resolver */                                                             \
-    static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) {                  \
-        if (check_avx512vbmi()) {                                              \
-            return JOIN(avx512vbmi_, NAME);                                    \
-        }                                                                      \
-        if (check_avx512()) {                                                  \
-            return JOIN(avx512_, NAME);                                        \
-        }                                                                      \
-        if (check_avx2()) {                                                    \
-            return JOIN(avx2_, NAME);                                          \
-        }                                                                      \
-        if (check_sse42() && check_popcnt()) {                                 \
-            return JOIN(corei7_, NAME);                                        \
-        }                                                                      \
-        if (check_ssse3()) {                                                   \
-            return JOIN(core2_, NAME);                                         \
-        }                                                                      \
-        /* anything else is fail */                                            \
-        return JOIN(error_, NAME);                                             \
-    }                                                                          \
+    /* dispatch routing pointer for this function */                           \
+    /* initially point it at the resolve function */                           \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__);                            \
+    static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) =                  \
+        &JOIN(resolve_, NAME);                                                 \
                                                                                \
-    /* function */                                                             \
-    HS_PUBLIC_API                                                              \
-    RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
+    /* resolver */                                                             \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) {                           \
+        if (check_avx512vbmi()) {                                              \
+            fat_dispatch_ ## NAME = &JOIN(avx512vbmi_, NAME);                  \
+        }                                                                      \
+        else if (check_avx512()) {                                             \
+            fat_dispatch_ ## NAME = &JOIN(avx512_, NAME);                      \
+        }                                                                      \
+        else if (check_avx2()) {                                               \
+            fat_dispatch_ ## NAME = &JOIN(avx2_, NAME);                        \
+        }                                                                      \
+        else if (check_sse42() && check_popcnt()) {                            \
+            fat_dispatch_ ## NAME = &JOIN(corei7_, NAME);                      \
+        }                                                                      \
+        else if (check_ssse3()) {                                              \
+            fat_dispatch_ ## NAME = &JOIN(core2_, NAME);                       \
+        } else {                                                               \
+            /* anything else is fail */                                        \
+            fat_dispatch_ ## NAME = &JOIN(error_, NAME);                       \
+        }                                                                      \
+
+
+
+/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
+
+
 
 #elif defined(ARCH_AARCH64)
 #include "util/arch/arm/cpuid_inline.h"
@@ -97,99 +139,226 @@
         return (RTYPE)HS_ARCH_ERROR;                                           \
     }                                                                          \
                                                                                \
-    /* resolver */                                                             \
-    static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) {                  \
-        if (check_sve2()) {                                                    \
-            return JOIN(sve2_, NAME);                                          \
-        }                                                                      \
-        if (check_sve()) {                                                     \
-            return JOIN(sve_, NAME);                                           \
-        }                                                                      \
-        if (check_neon()) {                                                    \
-            return JOIN(neon_, NAME);                                          \
-        }                                                                      \
-        /* anything else is fail */                                            \
-        return JOIN(error_, NAME);                                             \
-    }                                                                          \
+    /* dispatch routing pointer for this function */                           \
+    /* initially point it at the resolve function */                           \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__);                            \
+    static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) =                  \
+        &JOIN(resolve_, NAME);                                                 \
                                                                                \
-    /* function */                                                             \
-    HS_PUBLIC_API                                                              \
-    RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
+    /* resolver */                                                             \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) {                           \
+        if (check_sve2()) {                                                    \
+            fat_dispatch_ ## NAME = &JOIN(sve2_, NAME);                        \
+        }                                                                      \
+        else if (check_sve()) {                                                \
+            fat_dispatch_ ## NAME = &JOIN(sve_, NAME);                         \
+        }                                                                      \
+        else if (check_neon()) {                                               \
+            fat_dispatch_ ## NAME = &JOIN(neon_, NAME);                        \
+        } else {                                                               \
+            /* anything else is fail */                                        \
+            fat_dispatch_ ## NAME = &JOIN(error_, NAME);                       \
+        }                                                                      \
+
+
+/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
+
 
 #endif
 
+
+#define CONNECT_ARGS_1(RTYPE, NAME, ...)                                       \
+        return (*fat_dispatch_ ## NAME)(__VA_ARGS__);                          \
+    }                                                                          \
+
+
+#define CONNECT_DISPATCH_2(RTYPE, NAME, ...)                                   \
+    /* new function */                                                         \
+    HS_PUBLIC_API                                                              \
+    RTYPE NAME(__VA_ARGS__) {                                                  \
+
+
+#define CONNECT_ARGS_3(RTYPE, NAME, ...)                                       \
+        return (*fat_dispatch_ ## NAME)(__VA_ARGS__);                          \
+    }                                                                          \
+
+
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-function"
+
+/* this gets a bit ugly to compose the static redirect functions,
+ * as we necessarily need first the typed arg list and then just the arg
+ * names, twice in a row, to define the redirect function and the
+ * dispatch function call */
+
 CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
                 unsigned length, unsigned flags, hs_scratch_t *scratch,
                 match_event_handler onEvent, void *userCtx);
+CONNECT_ARGS_1(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
+                unsigned length, unsigned flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *userCtx);
+CONNECT_ARGS_3(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
 
 CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
                 size_t *stream_size);
+CONNECT_ARGS_1(hs_error_t, hs_stream_size, database, stream_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_stream_size, const hs_database_t *database,
+                size_t *stream_size);
+CONNECT_ARGS_3(hs_error_t, hs_stream_size, database, stream_size);
 
 CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
                 size_t *size);
+CONNECT_ARGS_1(hs_error_t, hs_database_size, db, size);
+CONNECT_DISPATCH_2(hs_error_t, hs_database_size, const hs_database_t *db,
+                size_t *size);
+CONNECT_ARGS_3(hs_error_t, hs_database_size, db, size);
+
 CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, dbIsValid, db);
+CONNECT_DISPATCH_2(hs_error_t, dbIsValid, const hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, dbIsValid, db);
+
 CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, hs_free_database, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_free_database, hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, hs_free_database, db);
 
 CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
                 unsigned int flags, hs_stream_t **stream);
+CONNECT_ARGS_1(hs_error_t, hs_open_stream, db, flags, stream);
+CONNECT_DISPATCH_2(hs_error_t, hs_open_stream, const hs_database_t *db,
+                unsigned int flags, hs_stream_t **stream);
+CONNECT_ARGS_3(hs_error_t, hs_open_stream, db, flags, stream);
 
 CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
                 unsigned int length, unsigned int flags, hs_scratch_t *scratch,
                 match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_1(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
+                unsigned int length, unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_3(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
 
 CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
                 hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_1(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
+CONNECT_DISPATCH_2(hs_error_t, hs_close_stream, hs_stream_t *id,
+                hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_3(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
 
 CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
                 const char *const *data, const unsigned int *length,
                 unsigned int count, unsigned int flags, hs_scratch_t *scratch,
                 match_event_handler onevent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan_vector, const hs_database_t *db,
+                const char *const *data, const unsigned int *length,
+                unsigned int count, unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onevent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
 
 CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
+CONNECT_ARGS_1(hs_error_t, hs_database_info, db, info);
+CONNECT_DISPATCH_2(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
+CONNECT_ARGS_3(hs_error_t, hs_database_info, db, info);
 
 CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
                 const hs_stream_t *from_id);
+CONNECT_ARGS_1(hs_error_t, hs_copy_stream, to_id, from_id);
+CONNECT_DISPATCH_2(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
+                const hs_stream_t *from_id);
+CONNECT_ARGS_3(hs_error_t, hs_copy_stream, to_id, from_id);
 
 CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
                 unsigned int flags, hs_scratch_t *scratch,
                 match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_stream, hs_stream_t *id,
+                unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
 
 CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
                 const hs_stream_t *from_id, hs_scratch_t *scratch,
                 match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
+                const hs_stream_t *from_id, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
 
 CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
                 char **bytes, size_t *length);
+CONNECT_ARGS_1(hs_error_t, hs_serialize_database, db, bytes, length);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialize_database, const hs_database_t *db,
+                char **bytes, size_t *length);
+CONNECT_ARGS_3(hs_error_t, hs_serialize_database, db, bytes, length);
 
 CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
                 const size_t length, hs_database_t **db);
+CONNECT_ARGS_1(hs_error_t, hs_deserialize_database, bytes, length, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database, const char *bytes,
+                const size_t length, hs_database_t **db);
+CONNECT_ARGS_3(hs_error_t, hs_deserialize_database, bytes, length, db);
 
 CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
                 const size_t length, hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, hs_deserialize_database_at, bytes, length, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database_at, const char *bytes,
+                const size_t length, hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, hs_deserialize_database_at, bytes, length, db);
 
 CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
                 size_t length, char **info);
+CONNECT_ARGS_1(hs_error_t, hs_serialized_database_info, bytes, length, info);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_info, const char *bytes,
+                size_t length, char **info);
+CONNECT_ARGS_3(hs_error_t, hs_serialized_database_info, bytes, length, info);
 
 CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
                 const size_t length, size_t *deserialized_size);
+CONNECT_ARGS_1(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_size, const char *bytes,
+                const size_t length, size_t *deserialized_size);
+CONNECT_ARGS_3(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
 
 CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
                 char *buf, size_t buf_space, size_t *used_space);
+CONNECT_ARGS_1(hs_error_t, hs_compress_stream, stream,
+                buf, buf_space, used_space);
+CONNECT_DISPATCH_2(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
+                char *buf, size_t buf_space, size_t *used_space);
+CONNECT_ARGS_3(hs_error_t, hs_compress_stream, stream,
+                buf, buf_space, used_space);
 
 CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
                 hs_stream_t **stream, const char *buf,size_t buf_size);
+CONNECT_ARGS_1(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_expand_stream, const hs_database_t *db,
+                hs_stream_t **stream, const char *buf,size_t buf_size);
+CONNECT_ARGS_3(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
 
 CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
                 const char *buf, size_t buf_size, hs_scratch_t *scratch,
                 match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_and_expand_stream, to_stream,
+                buf, buf_size, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
+                const char *buf, size_t buf_size, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
+                buf, buf_size, scratch, onEvent, context);
 
 /** INTERNALS **/
 
 CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
+CONNECT_ARGS_1(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
+CONNECT_DISPATCH_2(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
+CONNECT_ARGS_3(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
 
 #pragma GCC diagnostic pop
 #pragma GCC diagnostic pop
+

From c7439a605ef7593b4992ef6c62fc4340be93c635 Mon Sep 17 00:00:00 2001
From: Gregory Economou <groik@devel-freebsd14-aarch64-01.lan>
Date: Mon, 22 Apr 2024 12:07:15 +0300
Subject: [PATCH 21/56] removed LD_LIBRARY_PATH comment from readme

---
 README.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/README.md b/README.md
index 2e68d2e6..51cd60da 100644
--- a/README.md
+++ b/README.md
@@ -175,13 +175,6 @@ export CXX="/usr/local/bin/g++12"
 
 Then continue with the build as below. 
 
-A note about running in FreeBSD: if you built a dynamically linked binary
-with an alternative compiler, the libraries specific to the compiler that
-built the binary will probably not be found and the base distro libraries
-in /lib will be found instead. Adjust LD_LIBRARY_PATH appropriately. For
-example, with gcc12 installed from pkg, one would want to use
-```export LD_LIBRARY_PATH=/usr/local/lib/gcc12/``` 
-
 ## Configure & build
 
 In order to configure with `cmake` first create and cd into a build directory:

From c4bffd7cefe05d4b1112532f556e104a90026ae6 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 12:15:12 +0300
Subject: [PATCH 22/56] accessMoved cppcheck error

---
 src/rose/rose_build_merge.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index cddbb760..6dad3ecd 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -1599,7 +1599,8 @@ void dedupeLeftfixesVariableLag(RoseBuildImpl &build) {
                 continue;
             }
         }
-        engine_groups[DedupeLeftKey(build, std::move(preds), left)].emplace_back(left);
+        auto preds_copy = preds;
+        engine_groups[DedupeLeftKey(build, std::move(preds_copy), left)].emplace_back(left);
     }
 
     /* We don't bother chunking as we expect deduping to be successful if the

From 06fc35321dddf2fefa19e812ffa43e0dde3bfb91 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 12:27:43 +0300
Subject: [PATCH 23/56] unsignedLessThanZero cppcheck

---
 src/parser/logical_combination.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp
index b75ca34f..a37f4e5f 100644
--- a/src/parser/logical_combination.cpp
+++ b/src/parser/logical_combination.cpp
@@ -284,7 +284,7 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical,
                 if (logical[i] == '(') {
                     paren += 1;
                 } else if (logical[i] == ')') {
-                    if (paren <= 0) {
+                    if (paren == 0) {
                         throw LocatedParseError("Not enough left parentheses");
                     }
                     paren -= 1;

From 182f7ddb4775ead881959326fb3004f51ccab2fc Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:47:21 +0300
Subject: [PATCH 24/56] useInitializationList

---
 src/util/graph_small_color_map.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/util/graph_small_color_map.h b/src/util/graph_small_color_map.h
index 249b7153..a85f4b77 100644
--- a/src/util/graph_small_color_map.h
+++ b/src/util/graph_small_color_map.h
@@ -102,10 +102,10 @@ public:
     using category = boost::read_write_property_map_tag;
 
     small_color_map(size_t n_in, const IndexMap &index_map_in)
-        : n(n_in), index_map(index_map_in) {
-        size_t num_bytes = (n + entries_per_byte - 1) / entries_per_byte;
-        data = std::make_shared<std::vector<unsigned char>>(num_bytes);
-        fill(small_color::white);
+    : n(n_in), 
+      index_map(index_map_in),
+      data(std::make_shared<std::vector<unsigned char>>((n_in + entries_per_byte - 1) / entries_per_byte)) {
+    fill(small_color::white);
     }
 
     void fill(small_color color) {

From 9d5753b215154a70c66739c9821f508bd3ce357e Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:48:12 +0300
Subject: [PATCH 25/56] comparisonOfBoolWithBoolError

---
 src/nfagraph/ng_limex_accel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp
index 8bac753d..ac5174f3 100644
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@@ -321,7 +321,7 @@ struct DAccelScheme {
             bool cd_a = buildDvermMask(a.double_byte);
             bool cd_b = buildDvermMask(b.double_byte);
             if (cd_a != cd_b) {
-                return cd_a > cd_b;
+                return cd_a;
             }
         }
 

From 9316d65022efad69861e722dce1f49c9269611e8 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:48:35 +0300
Subject: [PATCH 26/56] redundantContinue

---
 src/nfagraph/ng_squash.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp
index 0b51792b..b06a7af0 100644
--- a/src/nfagraph/ng_squash.cpp
+++ b/src/nfagraph/ng_squash.cpp
@@ -589,7 +589,7 @@ void getHighlanderReporters(const NGHolder &g, const NFAVertex accept,
 
         verts.insert(v);
     next_vertex:
-        continue;
+        ;
     }
 }
 

From 73e00e2abce4865d67a4a45894ab014bab5951f5 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:48:51 +0300
Subject: [PATCH 27/56] funcArgOrderDifferent

---
 src/nfagraph/ng_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h
index a2d0d9b7..34199773 100644
--- a/src/nfagraph/ng_util.h
+++ b/src/nfagraph/ng_util.h
@@ -314,7 +314,7 @@ void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new);
 
 /** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
  * accepts. */
-void reverseHolder(const NGHolder &g, NGHolder &out);
+void reverseHolder(const NGHolder &g_in, NGHolder &g);
 
 /** \brief Returns the delay or ~0U if the graph cannot match with
  * the trailing literal. */

From 8d3a5d7cf1ee81d9699e03c8e7ad388715087f5c Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:48:58 +0300
Subject: [PATCH 28/56] legacyUninitvar

---
 src/util/alloc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/alloc.cpp b/src/util/alloc.cpp
index 40004932..fb20f3d3 100644
--- a/src/util/alloc.cpp
+++ b/src/util/alloc.cpp
@@ -68,7 +68,7 @@ namespace ue2 {
 #endif
 
 void *aligned_malloc_internal(size_t size, size_t align) {
-    void *mem;
+    void *mem= nullptr;;
     int rv = posix_memalign(&mem, align, size);
     if (rv != 0) {
         DEBUG_PRINTF("posix_memalign returned %d when asked for %zu bytes\n",

From 52b0076f4f09809fff8a7ccf9c477cdd6643ae61 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 23 Apr 2024 14:49:10 +0300
Subject: [PATCH 29/56] accessMoved

---
 src/rose/rose_build_merge.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index 6dad3ecd..1e6c9222 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -1599,8 +1599,8 @@ void dedupeLeftfixesVariableLag(RoseBuildImpl &build) {
                 continue;
             }
         }
-        auto preds_copy = preds;
-        engine_groups[DedupeLeftKey(build, std::move(preds_copy), left)].emplace_back(left);
+        auto preds_copy = std::move(preds);
+        engine_groups[DedupeLeftKey(build, preds_copy , left)].emplace_back(left);
     }
 
     /* We don't bother chunking as we expect deduping to be successful if the

From 01dee390a98dd3f086d84dafe89a8350de924b7c Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Tue, 23 Apr 2024 19:08:24 +0300
Subject: [PATCH 30/56] addressing some cppcheck warnings. yes this will be
 cleaned up in a following commit. tests pass.

---
 benchmarks/benchmarks.cpp        |  2 +-
 src/nfagraph/ng.cpp              |  3 ---
 src/nfagraph/ng_limex_accel.cpp  |  4 ++--
 src/nfagraph/ng_som_util.cpp     | 23 ++++++++++++-----------
 src/rose/rose_build_bytecode.cpp |  6 ++++--
 src/rose/rose_build_castle.cpp   |  2 +-
 tools/hscheck/main.cpp           |  2 +-
 util/ExpressionParser.rl         |  2 +-
 util/ng_corpus_generator.h       |  2 +-
 9 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index 91cab3f8..d8b77992 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -113,7 +113,7 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
         auto end = std::chrono::steady_clock::now();
         total_sec += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
         /*calculate transferred size*/
-        total_size = size * loops;
+        total_size = (u64a)size * (u64a)loops;
         /*calculate average time*/
         avg_time = total_sec / loops;
         /*convert microseconds to seconds*/
diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp
index b2a87523..7189baad 100644
--- a/src/nfagraph/ng.cpp
+++ b/src/nfagraph/ng.cpp
@@ -193,9 +193,6 @@ void reduceGraph(NGHolder &g, som_type som, bool utf8,
 
     if (!som) {
         mergeCyclicDotStars(g);
-    }
-
-    if (!som) {
         removeSiblingsOfStartDotStar(g);
     }
 }
diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp
index 8bac753d..58149ad3 100644
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@@ -811,11 +811,11 @@ depth_done:
                 return true;
             }
         }
-    }
+    // } 
 
     // Second option: a two-byte shufti (i.e. less than eight 2-byte
     // literals)
-    if (depth > 1) {
+    // if (depth > 1) {
         for (unsigned int i = 0; i < (depth - 1); i++) {
             if (depthReach[i].count() * depthReach[i+1].count()
                 <= DOUBLE_SHUFTI_LIMIT) {
diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp
index 82277c06..01beec8d 100644
--- a/src/nfagraph/ng_som_util.cpp
+++ b/src/nfagraph/ng_som_util.cpp
@@ -267,17 +267,18 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
     boost::depth_first_search(c_g, visitor(backEdgeVisitor)
                                    .root_vertex(c_g.start));
 
-    for (const auto &e : be) {
-        NFAVertex s = source(e, c_g);
-        NFAVertex t = target(e, c_g);
-        DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index);
-        if (s != t) {
-            assert(0);
-            DEBUG_PRINTF("eek big cycle\n");
-            rv = true; /* big cycle -> eek */
-            goto exit;
-        }
-    }
+    // with be.clear right above does this ever run at all?
+    //for (const auto &e : be) {
+    //    NFAVertex s = source(e, c_g);
+    //    NFAVertex t = target(e, c_g);
+    //    DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index);
+    //    if (s != t) {
+    //        assert(0);
+    //        DEBUG_PRINTF("eek big cycle\n");
+    //        rv = true; /* big cycle -> eek */
+    //        goto exit;
+    //    }
+    //}
 
     DEBUG_PRINTF("checking acyclic+selfloop graph\n");
 
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 06f36582..2df3b3a3 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -2975,7 +2975,8 @@ void buildFragmentPrograms(const RoseBuildImpl &build,
             !lit_prog.empty()) {
             auto &cfrag = fragments[pfrag.included_frag_id];
             assert(pfrag.s.length() >= cfrag.s.length() &&
-                   !pfrag.s.any_nocase() >= !cfrag.s.any_nocase());
+                   !pfrag.s.any_nocase() == !cfrag.s.any_nocase());
+                   /** !pfrag.s.any_nocase() >= !cfrag.s.any_nocase()); **/
             u32 child_offset = cfrag.lit_program_offset;
             DEBUG_PRINTF("child %u offset %u\n", cfrag.fragment_id,
                          child_offset);
@@ -2993,7 +2994,8 @@ void buildFragmentPrograms(const RoseBuildImpl &build,
         if (pfrag.included_delay_frag_id != INVALID_FRAG_ID &&
             !rebuild_prog.empty()) {
             auto &cfrag = fragments[pfrag.included_delay_frag_id];
-            assert(pfrag.s.length() >= cfrag.s.length() &&
+            /** assert(pfrag.s.length() >= cfrag.s.length() && **/
+            assert(pfrag.s.length() == cfrag.s.length() &&
                    !pfrag.s.any_nocase() >= !cfrag.s.any_nocase());
             u32 child_offset = cfrag.delay_program_offset;
             DEBUG_PRINTF("child %u offset %u\n", cfrag.fragment_id,
diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp
index f3357982..2a1dcdd9 100644
--- a/src/rose/rose_build_castle.cpp
+++ b/src/rose/rose_build_castle.cpp
@@ -170,7 +170,7 @@ void renovateCastle(RoseBuildImpl &tbi, CastleProto *castle,
                 return; /* bail - TODO: be less lazy */
             }
 
-            vector<CharReach> rem_local_cr;
+            //vector<CharReach> rem_local_cr;
             u32 ok_count = 0;
             for (auto it = e.s.end() - g[v].left.lag; it != e.s.end(); ++it) {
                 if (!isSubsetOf(*it, cr)) {
diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp
index f3e9419a..2c73baf5 100644
--- a/tools/hscheck/main.cpp
+++ b/tools/hscheck/main.cpp
@@ -97,7 +97,7 @@ unsigned int countFailures = 0;
 
 class ParsedExpr {
 public:
-    ParsedExpr(string regex_in, unsigned int flags_in, hs_expr_ext ext_in)
+    ParsedExpr(string regex_in, unsigned int flags_in, hs_expr_ext& ext_in)
         : regex(regex_in), flags(flags_in), ext(ext_in) {}
     ~ParsedExpr() {}
     string regex;
diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl
index b93f069d..ad201fec 100644
--- a/util/ExpressionParser.rl
+++ b/util/ExpressionParser.rl
@@ -152,7 +152,7 @@ bool HS_CDECL readExpression(const std::string &input, std::string &expr,
     UNUSED const char *eof = pe;
     UNUSED const char *ts = p, *te = p;
     int cs;
-    UNUSED int act;
+    //UNUSED int act;
 
     assert(p);
     assert(pe);
diff --git a/util/ng_corpus_generator.h b/util/ng_corpus_generator.h
index f230a10d..cd84a9ab 100644
--- a/util/ng_corpus_generator.h
+++ b/util/ng_corpus_generator.h
@@ -47,7 +47,7 @@ class NGHolder;
 } // namespace ue2
 
 struct CorpusGenerationFailure {
-    explicit CorpusGenerationFailure(const std::string s) :
+    explicit CorpusGenerationFailure(const std::string& s) :
         message(std::move(s)) {}
     std::string message;
 };

From 8dabc86a69cb5a36336807f1c228f3ab7e77f4a0 Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Tue, 23 Apr 2024 23:46:08 +0300
Subject: [PATCH 31/56] removed commented lines.

---
 src/nfagraph/ng_limex_accel.cpp |  2 --
 src/nfagraph/ng_som_util.cpp    | 13 -------------
 src/rose/rose_build_castle.cpp  |  1 -
 3 files changed, 16 deletions(-)

diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp
index 58149ad3..d31f84c0 100644
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@@ -811,11 +811,9 @@ depth_done:
                 return true;
             }
         }
-    // } 
 
     // Second option: a two-byte shufti (i.e. less than eight 2-byte
     // literals)
-    // if (depth > 1) {
         for (unsigned int i = 0; i < (depth - 1); i++) {
             if (depthReach[i].count() * depthReach[i+1].count()
                 <= DOUBLE_SHUFTI_LIMIT) {
diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp
index 01beec8d..0c44f063 100644
--- a/src/nfagraph/ng_som_util.cpp
+++ b/src/nfagraph/ng_som_util.cpp
@@ -267,19 +267,6 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
     boost::depth_first_search(c_g, visitor(backEdgeVisitor)
                                    .root_vertex(c_g.start));
 
-    // with be.clear right above does this ever run at all?
-    //for (const auto &e : be) {
-    //    NFAVertex s = source(e, c_g);
-    //    NFAVertex t = target(e, c_g);
-    //    DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index);
-    //    if (s != t) {
-    //        assert(0);
-    //        DEBUG_PRINTF("eek big cycle\n");
-    //        rv = true; /* big cycle -> eek */
-    //        goto exit;
-    //    }
-    //}
-
     DEBUG_PRINTF("checking acyclic+selfloop graph\n");
 
     rv = !firstMatchIsFirst(c_g);
diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp
index 2a1dcdd9..990f0c55 100644
--- a/src/rose/rose_build_castle.cpp
+++ b/src/rose/rose_build_castle.cpp
@@ -170,7 +170,6 @@ void renovateCastle(RoseBuildImpl &tbi, CastleProto *castle,
                 return; /* bail - TODO: be less lazy */
             }
 
-            //vector<CharReach> rem_local_cr;
             u32 ok_count = 0;
             for (auto it = e.s.end() - g[v].left.lag; it != e.s.end(); ++it) {
                 if (!isSubsetOf(*it, cr)) {

From 91d9631c973a108846a930e634580b4d66bb7947 Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 24 Apr 2024 00:04:59 +0300
Subject: [PATCH 32/56] fixed some const issues

---
 tools/hscheck/main.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp
index 2c73baf5..87bedf75 100644
--- a/tools/hscheck/main.cpp
+++ b/tools/hscheck/main.cpp
@@ -97,12 +97,13 @@ unsigned int countFailures = 0;
 
 class ParsedExpr {
 public:
-    ParsedExpr(string regex_in, unsigned int flags_in, hs_expr_ext& ext_in)
+    ParsedExpr(string regex_in, unsigned int flags_in, const hs_expr_ext& ext_in)
         : regex(regex_in), flags(flags_in), ext(ext_in) {}
     ~ParsedExpr() {}
     string regex;
     unsigned int flags;
+    // Own a copy: ParsedExpr is stored in ExprExtMap and must not refer to the caller's object.
     hs_expr_ext ext;
 };
 
 typedef map<unsigned int, ParsedExpr> ExprExtMap;

From d7006a7c8536d382e43c05ec006a1252de05c71e Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 24 Apr 2024 00:06:08 +0300
Subject: [PATCH 33/56] removed another commented line

---
 util/ExpressionParser.rl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl
index ad201fec..02761b32 100644
--- a/util/ExpressionParser.rl
+++ b/util/ExpressionParser.rl
@@ -152,7 +152,6 @@ bool HS_CDECL readExpression(const std::string &input, std::string &expr,
     UNUSED const char *eof = pe;
     UNUSED const char *ts = p, *te = p;
     int cs;
-    //UNUSED int act;
 
     assert(p);
     assert(pe);

From 9b9df1b3974e2680eed0c4923991e7cf72cc32a1 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 11:07:23 +0300
Subject: [PATCH 34/56] invalidPrintfArgType_sint

---
 src/parser/ComponentRepeat.cpp       | 2 +-
 unit/internal/multi_bit_compress.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/parser/ComponentRepeat.cpp b/src/parser/ComponentRepeat.cpp
index 7090459f..919dbccb 100644
--- a/src/parser/ComponentRepeat.cpp
+++ b/src/parser/ComponentRepeat.cpp
@@ -320,7 +320,7 @@ void ComponentRepeat::wireRepeats(GlushkovBuildState &bs) {
         }
     }
 
-    DEBUG_PRINTF("wiring up %d optional repeats\n", copies - m_min);
+    DEBUG_PRINTF("wiring up %u optional repeats\n", copies - m_min);
     for (u32 rep = MAX(m_min, 1); rep < copies; rep++) {
         vector<PositionInfo> lasts = m_lasts[rep - 1];
         if (rep != m_min) {
diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp
index 14c3f480..e0ec475c 100644
--- a/unit/internal/multi_bit_compress.cpp
+++ b/unit/internal/multi_bit_compress.cpp
@@ -46,7 +46,7 @@ UNUSED
 static
 void mmbit_display(const u8 *bits, u32 total_bits) {
     for (u32 i = 0; i < mmbit_size(total_bits); i += 8) {
-        printf("block %d:", i / 8);
+        printf("block %u:", i / 8);
         for (s32 j = 7; j >= 0; j--) {
             u8 a = (*(bits + i + j));
             printf(" %02x", a);
@@ -72,7 +72,7 @@ UNUSED
 static
 void mmbit_display_comp(const u8 *bits, u32 comp_size) {
     for (u32 i = 0; i < comp_size; i += 8) {
-        printf("block %d:", i / 8);
+        printf("block %u:", i / 8);
         for (s32 j = 7; j >= 0; j--) {
             u8 a = (*(bits + i + j));
             printf(" %02x", a);

From e6c884358e96b15ca57dbfecd8cfff8edcf80f96 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 11:13:02 +0300
Subject: [PATCH 35/56] uninitvar

---
 unit/internal/pqueue.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unit/internal/pqueue.cpp b/unit/internal/pqueue.cpp
index a0a37990..bd7e4650 100644
--- a/unit/internal/pqueue.cpp
+++ b/unit/internal/pqueue.cpp
@@ -245,7 +245,7 @@ TEST(pqueue, queue1) {
     u32 in[] = {1, 2, 3, 4, 5, 6, 7, 8};
     u32 expected[] = {4, 5, 6, 7, 8, 3, 2, 1};
     u32 temp[ARRAY_LENGTH(in)];
-    u32 output[ARRAY_LENGTH(in)];
+    u32 output[ARRAY_LENGTH(in)] = {0};
 
     u32 queue_size = 0;
     u32 i = 0, o = 0;
@@ -275,7 +275,7 @@ TEST(pqueue, queue2) {
     u32 in[] = {8, 7, 6, 5, 4, 3, 2, 1};
     u32 expected[] = {8, 7, 6, 5, 4, 3, 2, 1};
     u32 temp[ARRAY_LENGTH(in)];
-    u32 output[ARRAY_LENGTH(in)];
+    u32 output[ARRAY_LENGTH(in)] = {0};
 
     u32 queue_size = 0;
     u32 i = 0, o = 0;
@@ -301,7 +301,7 @@ TEST(pqueue, queue3) {
     u32 in[] = {1, 8, 2, 7, 3, 6, 4, 5};
     u32 expected[] = {8, 7, 6, 4, 5, 3, 2, 1};
     u32 temp[ARRAY_LENGTH(in)];
-    u32 output[ARRAY_LENGTH(in)];
+    u32 output[ARRAY_LENGTH(in)] = {0};
 
     u32 queue_size = 0;
     u32 i = 0, o = 0;

From adda613f516eb067c457dce9a5df2101ac4d9708 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 11:13:28 +0300
Subject: [PATCH 36/56] shiftTooManyBitsSigned

---
 unit/internal/bitutils.cpp    | 4 ++--
 unit/internal/supervector.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp
index 8af8f9a4..6adfc2d6 100644
--- a/unit/internal/bitutils.cpp
+++ b/unit/internal/bitutils.cpp
@@ -62,7 +62,7 @@ u32 our_clzll(u64a x) {
 TEST(BitUtils, findAndClearLSB32_1) {
     // test that it can find every single-bit case
     for (unsigned int i = 0; i < 32; i++) {
-        u32 input = 1 << i;
+        u32 input = 1U << i;
         u32 idx = findAndClearLSB_32(&input);
         EXPECT_EQ(i, idx);
         EXPECT_EQ(0U, input);
@@ -112,7 +112,7 @@ TEST(BitUtils, findAndClearLSB64_2) {
 TEST(BitUtils, findAndClearMSB32_1) {
     // test that it can find every single-bit case
     for (unsigned int i = 0; i < 32; i++) {
-        u32 input = 1 << i;
+        u32 input = 1U << i;
         u32 idx = findAndClearMSB_32(&input);
         EXPECT_EQ(i, idx);
         EXPECT_EQ(0U, input);
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index 2432e598..ac3daf2a 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -508,7 +508,7 @@ TEST(SuperVectorUtilsTest,Movemask256c){
     u8 vec2[32] = {0};
     u32 r = rand() % 100 + 1;
     for(int i=0; i<32; i++) {
-        if (r & (1 << i)) {
+        if (r & (1U << i)) {
             vec[i] = 0xff;
         }
     }

From fd3e251afa1240a4c85bb472abb171a803a56690 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 12:40:55 +0300
Subject: [PATCH 37/56] redundantInitialization

---
 src/nfa/limex_compile.cpp         | 2 +-
 src/nfa/mpv.c                     | 8 ++++----
 src/nfagraph/ng_anchored_dots.cpp | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp
index f84cdc32..2ec65552 100644
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@@ -1572,7 +1572,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
 static
 int getLimexScore(const build_info &args, u32 nShifts) {
     const NGHolder &h = args.h;
-    u32 maxVarShift = nShifts;
+    u32 maxVarShift;
     int score = 0;
 
     score += SHIFT_COST * nShifts;
diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c
index cba3d159..2b1b5c5f 100644
--- a/src/nfa/mpv.c
+++ b/src/nfa/mpv.c
@@ -512,7 +512,7 @@ size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf,
 
     verm_restart:;
         assert(buf[curr] == kp->u.verm.c);
-        size_t test = curr;
+        size_t test;
         if (curr + min_rep < length) {
             test = curr + min_rep;
         } else {
@@ -534,7 +534,7 @@ size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf,
         m128 hi = kp->u.shuf.mask_hi;
     shuf_restart:
         assert(do_single_shufti(lo, hi, buf[curr]));
-        size_t test = curr;
+        size_t test;
         if (curr + min_rep < length) {
             test = curr + min_rep;
         } else {
@@ -556,7 +556,7 @@ size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf,
         const m128 mask1 = kp->u.truffle.mask1;
         const m128 mask2 = kp->u.truffle.mask2;
     truffle_restart:;
-        size_t test = curr;
+        size_t test;
         if (curr + min_rep < length) {
             test = curr + min_rep;
         } else {
@@ -582,7 +582,7 @@ size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf,
 
     nverm_restart:;
         assert(buf[curr] != kp->u.verm.c);
-        size_t test = curr;
+        size_t test;
         if (curr + min_rep < length) {
             test = curr + min_rep;
         } else {
diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp
index 9a13376d..8286d816 100644
--- a/src/nfagraph/ng_anchored_dots.cpp
+++ b/src/nfagraph/ng_anchored_dots.cpp
@@ -165,9 +165,9 @@ void reformAnchoredRepeatsComponent(NGHolder &g,
         return;
     }
 
-    NFAVertex dotV = NGHolder::null_vertex();
+    
     set<NFAVertex> otherV;
-    dotV = findReformable(g, compAnchoredStarts, otherV);
+    NFAVertex dotV = findReformable(g, compAnchoredStarts, otherV);
     if (dotV == NGHolder::null_vertex()) {
         DEBUG_PRINTF("no candidate reformable dot found.\n");
         return;
@@ -268,9 +268,9 @@ void reformUnanchoredRepeatsComponent(NGHolder &g,
     }
 
     while (true) {
-        NFAVertex dotV = NGHolder::null_vertex();
+        
         set<NFAVertex> otherV;
-        dotV = findReformable(g, compUnanchoredStarts, otherV);
+        NFAVertex dotV = findReformable(g, compUnanchoredStarts, otherV);
         if (dotV == NGHolder::null_vertex()) {
             DEBUG_PRINTF("no candidate reformable dot found.\n");
             return;

From 72371a05dca2d3a768429461fd7af4a618c554b2 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 13:15:17 +0300
Subject: [PATCH 38/56] derefInvalidIteratorRedundantCheck

---
 src/nfagraph/ng_som.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp
index 359fa17b..0e42b4b5 100644
--- a/src/nfagraph/ng_som.cpp
+++ b/src/nfagraph/ng_som.cpp
@@ -1292,8 +1292,8 @@ bool doTreePlanningIntl(NGHolder &g,
         DEBUG_PRINTF("add mapped reporters for region %u\n", it->first);
         addMappedReporterVertices(it->second, g, copy_to_orig,
                                   plan.back().reporters);
-    } while (it->second.optional && it != info.rend() &&
-             (++it)->first > furthest->first);
+    } while (it != info.rend() && it->second.optional && 
+            (++it)->first > furthest->first);
 
     return true;
 }
@@ -1551,7 +1551,7 @@ bool doSomPlanning(NGHolder &g, bool stuck_in,
         DEBUG_PRINTF("region %u contributes reporters to last plan\n",
                      it->first);
         addReporterVertices(it->second, g, plan.back().reporters);
-    } while (it->second.optional && it != info.rend() &&
+    } while (it != info.rend() && it->second.optional &&
              (++it)->first > furthest->first);
 
     DEBUG_PRINTF("done!\n");

From 5dab841cea8809a64686fe75503439cb4096bcc0 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 24 Apr 2024 15:50:55 +0300
Subject: [PATCH 39/56] badBitmaskCheck

---
 src/database.h                       |  9 +++------
 src/fdr/teddy_engine_description.cpp | 16 ++++++++--------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/database.h b/src/database.h
index a4d6e4dc..1b94f1b0 100644
--- a/src/database.h
+++ b/src/database.h
@@ -79,21 +79,18 @@ static UNUSED
 const platform_t hs_current_platform_no_avx2 = {
     HS_PLATFORM_NOAVX2 |
     HS_PLATFORM_NOAVX512 |
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI 
 };
 
 static UNUSED
 const platform_t hs_current_platform_no_avx512 = {
     HS_PLATFORM_NOAVX512 |
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI
 };
 
 static UNUSED
 const platform_t hs_current_platform_no_avx512vbmi = {
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI 
 };
 
 /*
diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp
index 7cd33ab2..5f05a055 100644
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@@ -52,14 +52,14 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
 
 void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
     static const TeddyEngineDef defns[] = {
-        { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
-        { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
-        { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
-        { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
-        { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
-        { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
-        { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
-        { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
+        { 3, HS_CPU_FEATURES_AVX2, 1, 16, false },
+        { 4, HS_CPU_FEATURES_AVX2, 1, 16, true },
+        { 5, HS_CPU_FEATURES_AVX2, 2, 16, false },
+        { 6, HS_CPU_FEATURES_AVX2, 2, 16, true },
+        { 7, HS_CPU_FEATURES_AVX2, 3, 16, false },
+        { 8, HS_CPU_FEATURES_AVX2, 3, 16, true },
+        { 9, HS_CPU_FEATURES_AVX2, 4, 16, false },
+        { 10, HS_CPU_FEATURES_AVX2, 4, 16, true },
         { 11, 0, 1, 8, false },
         { 12, 0, 1, 8, true },
         { 13, 0, 2, 8, false },

From 11e49683672f489d2515344696593c1a6b5bb6ee Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 24 Apr 2024 15:55:57 +0300
Subject: [PATCH 40/56] cppcheck invalidPrintfArgType_uint warnings

---
 benchmarks/benchmarks.cpp          | 4 ++--
 tools/hsbench/engine_hyperscan.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index d8b77992..5dc08ee4 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -102,7 +102,7 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
         avg_bw /= max_matches;
 	total_sec /= 1000000.0;
         /*convert average time to us*/
-        printf(KMAG "%s: %u matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
+        printf(KMAG "%s: %d matches, %d * %d iterations," KBLU " total elapsed time =" RST " %.3f s, " 
                KBLU "average time per call =" RST " %.3f μs," KBLU " max bandwidth = " RST " %.3f MB/s," KBLU " average bandwidth =" RST " %.3f MB/s \n",
                bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
     } else {
@@ -122,7 +122,7 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
         max_bw = total_size / total_sec;
         /*convert to MB/s*/
         max_bw /= 1048576.0;
-        printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
+        printf(KMAG "%s: no matches, %d * %d iterations," KBLU " total elapsed time =" RST " %.3f s, " 
                KBLU "average time per call =" RST " %.3f μs ," KBLU " bandwidth = " RST " %.3f MB/s \n",
                bench.label, size ,loops, total_sec, avg_time, max_bw );
     }
diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp
index 95461de5..268b14d6 100644
--- a/tools/hsbench/engine_hyperscan.cpp
+++ b/tools/hsbench/engine_hyperscan.cpp
@@ -456,7 +456,7 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode,
 
         if (err == HS_COMPILER_ERROR) {
             if (compile_err->expression >= 0) {
-                printf("Compile error for signature #%u: %s\n",
+                printf("Compile error for signature #%d: %s\n",
                        compile_err->expression, compile_err->message);
             } else {
                 printf("Compile error: %s\n", compile_err->message);

From e291d498fa3fa7b4ae0a625d49a30a9d93eeee0e Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 24 Apr 2024 16:26:38 +0300
Subject: [PATCH 41/56] fixed merge mixup

---
 benchmarks/benchmarks.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index ff6496a6..e48652e9 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -126,7 +126,7 @@ static void run_benchmarks(int size, int loops, int max_matches,
         /*convert to MB/s*/
         max_bw /= 1048576.0;
         printf("%-18s, %-12s, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
-               bench.label, size ,loops, total_sec, avg_time, max_bw );
+               bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
     }
 }
 

From 7fd45f864cefbce9c4ceb72acd9b1321dc4c8f2f Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 24 Apr 2024 17:32:09 +0300
Subject: [PATCH 42/56] next batch for cppcheck, addressing syntaxError and
 constParameterPointer

---
 src/fdr/fdr.c                    | 2 +-
 src/nfa/castle.c                 | 2 +-
 src/nfa/goughcompile.cpp         | 2 +-
 src/nfa/limex_common_impl.h      | 4 ++--
 src/nfa/limex_runtime_impl.h     | 2 +-
 src/nfa/mpv.c                    | 2 +-
 src/rose/rose_build_matchers.cpp | 2 +-
 src/rose/stream.c                | 2 +-
 src/scratch.h                    | 4 ++--
 src/som/som_runtime.c            | 6 +++---
 tools/hsbench/main.cpp           | 4 ++--
 unit/hyperscan/test_util.cpp     | 2 +-
 12 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 561e8f98..16f453c5 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -298,7 +298,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
 static really_inline
 void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
                     const u32 *confBase, const struct FDR_Runtime_Args *a,
-                    const u8 *ptr, u32 *last_match_id, struct zone *z) {
+                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
     const u8 bucket = 8;
 
     if (likely(!*conf)) {
diff --git a/src/nfa/castle.c b/src/nfa/castle.c
index 29208f8d..4c103636 100644
--- a/src/nfa/castle.c
+++ b/src/nfa/castle.c
@@ -400,7 +400,7 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
 }
 
 static really_inline
-u64a subCastleNextMatch(const struct Castle *c, void *full_state,
+u64a subCastleNextMatch(const struct Castle *c, const void *full_state,
                         void *stream_state, const u64a loc,
                         const u32 subIdx) {
     DEBUG_PRINTF("subcastle %u\n", subIdx);
diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp
index 59ef052f..d703a32c 100644
--- a/src/nfa/goughcompile.cpp
+++ b/src/nfa/goughcompile.cpp
@@ -659,7 +659,7 @@ GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
 }
 
 const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(
-                                                 GoughSSAVar *input) const {
+                                                 const GoughSSAVar *input) const {
     return input_map.at(input);
 }
 
diff --git a/src/nfa/limex_common_impl.h b/src/nfa/limex_common_impl.h
index e441945d..48661871 100644
--- a/src/nfa/limex_common_impl.h
+++ b/src/nfa/limex_common_impl.h
@@ -332,7 +332,7 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx,
 // UE-1636) need to guard cyclic tug-accepts as well.
 static really_inline
 char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
-                       union RepeatControl *repeat_ctrl, char *repeat_state,
+                       const union RepeatControl *repeat_ctrl, const char *repeat_state,
                        u64a offset, ReportID report) {
     assert(limex);
 
@@ -382,7 +382,7 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
 
 static really_inline
 char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
-                          union RepeatControl *repeat_ctrl, char *repeat_state,
+                          const union RepeatControl *repeat_ctrl, const char *repeat_state,
                           u64a offset) {
     assert(limex);
 
diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h
index 7b89182b..b282ae18 100644
--- a/src/nfa/limex_runtime_impl.h
+++ b/src/nfa/limex_runtime_impl.h
@@ -927,7 +927,7 @@ char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
                       context);
 }
 
-char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
+char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, const struct mq *q) {
     const IMPL_NFA_T *limex = getImplNfa(n);
     REPORTCURRENT_FN(limex, q);
     return 1;
diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c
index cba3d159..62bdbdb9 100644
--- a/src/nfa/mpv.c
+++ b/src/nfa/mpv.c
@@ -607,7 +607,7 @@ size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf,
 }
 
 static really_inline
-void restartKilo(const struct mpv *m, UNUSED u8 *active, u8 *reporters,
+void restartKilo(const struct mpv *m, UNUSED const u8 *active, u8 *reporters,
                  struct mpv_decomp_state *dstate, struct mpv_pq_item *pq,
                  const u8 *buf, u64a prev_limit, size_t buf_length, u32 i) {
     const struct mpv_kilopuff *kp = (const void *)(m + 1);
diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp
index aa7a794d..3d68200f 100644
--- a/src/rose/rose_build_matchers.cpp
+++ b/src/rose/rose_build_matchers.cpp
@@ -884,7 +884,7 @@ void buildAccel(const RoseBuildImpl &build,
 }
 
 bytecode_ptr<HWLM>
-buildHWLMMatcher(const RoseBuildImpl &build, LitProto *litProto) {
+buildHWLMMatcher(const RoseBuildImpl &build, const LitProto *litProto) {
     if (!litProto) {
         return nullptr;
     }
diff --git a/src/rose/stream.c b/src/rose/stream.c
index 26268dd5..acf4855a 100644
--- a/src/rose/stream.c
+++ b/src/rose/stream.c
@@ -104,7 +104,7 @@ void runAnchoredTableStream(const struct RoseEngine *t, const void *atable,
 
 
 static really_inline
-void saveStreamState(const struct NFA *nfa, struct mq *q, s64a loc) {
+void saveStreamState(const struct NFA *nfa, const struct mq *q, s64a loc) {
     DEBUG_PRINTF("offset=%llu, length=%zu, hlength=%zu, loc=%lld\n",
                  q->offset, q->length, q->hlength, loc);
     nfaQueueCompressState(nfa, q, loc);
diff --git a/src/scratch.h b/src/scratch.h
index e3cd9245..e01ccd6b 100644
--- a/src/scratch.h
+++ b/src/scratch.h
@@ -215,12 +215,12 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
 
 /* array of fatbit ptr; TODO: why not an array of fatbits? */
 static really_inline
-struct fatbit **getAnchoredLiteralLog(struct hs_scratch *scratch) {
+struct fatbit **getAnchoredLiteralLog(const struct hs_scratch *scratch) {
     return scratch->al_log;
 }
 
 static really_inline
-struct fatbit **getDelaySlots(struct hs_scratch *scratch) {
+struct fatbit **getDelaySlots(const struct hs_scratch *scratch) {
     return scratch->delay_slots;
 }
 
diff --git a/src/som/som_runtime.c b/src/som/som_runtime.c
index 1a868efc..ce179ca0 100644
--- a/src/som/som_runtime.c
+++ b/src/som/som_runtime.c
@@ -69,8 +69,8 @@ void setSomLoc(struct fatbit *som_set_now, u64a *som_store, u32 som_store_count,
 }
 
 static really_inline
-char ok_and_mark_if_write(u8 *som_store_valid, struct fatbit *som_set_now,
-                          u8 *som_store_writable, u32 som_store_count,
+char ok_and_mark_if_write(u8 *som_store_valid, const struct fatbit *som_set_now,
+                          const u8 *som_store_writable, u32 som_store_count,
                           u32 loc) {
     return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */
         || fatbit_isset(som_set_now, som_store_count, loc) /* write here, need
@@ -79,7 +79,7 @@ char ok_and_mark_if_write(u8 *som_store_valid, struct fatbit *som_set_now,
 }
 
 static really_inline
-char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now,
+char ok_and_mark_if_unset(u8 *som_store_valid, const struct fatbit *som_set_now,
                           u32 som_store_count, u32 loc) {
     return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */
         || fatbit_isset(som_set_now, som_store_count, loc); /* write here, need
diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp
index 1a19d510..8d9f8d6d 100644
--- a/tools/hsbench/main.cpp
+++ b/tools/hsbench/main.cpp
@@ -465,7 +465,7 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
 
 /** Start the global timer. */
 static
-void startTotalTimer(ThreadContext *ctx) {
+void startTotalTimer(const ThreadContext *ctx) {
     if (ctx->num != 0) {
         return; // only runs in the first thread
     }
@@ -474,7 +474,7 @@ void startTotalTimer(ThreadContext *ctx) {
 
 /** Stop the global timer and calculate totals. */
 static
-void stopTotalTimer(ThreadContext *ctx) {
+void stopTotalTimer(const ThreadContext *ctx) {
     if (ctx->num != 0) {
         return; // only runs in the first thread
     }
diff --git a/unit/hyperscan/test_util.cpp b/unit/hyperscan/test_util.cpp
index f6c20a74..c7a26acd 100644
--- a/unit/hyperscan/test_util.cpp
+++ b/unit/hyperscan/test_util.cpp
@@ -58,7 +58,7 @@ std::ostream &operator<<(std::ostream &o, const pattern &p) {
 }
 
 hs_database_t *buildDB(const vector<pattern> &patterns, unsigned int mode,
-                       hs_platform_info *plat) {
+                       const hs_platform_info *plat) {
     vector<const char *> expressions;
     vector<unsigned int> flags;
     vector<unsigned int> ids;

From 49fd4f0047d2d04d6ea6edf2b1406fb1685e1c31 Mon Sep 17 00:00:00 2001
From: Yoan Picchi <yoan.picchi@arm.com>
Date: Thu, 4 Apr 2024 09:46:23 +0000
Subject: [PATCH 43/56] Enable sheng32 and sheng64 on Arm

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 src/nfa/sheng.c          |  8 ++---
 src/nfa/sheng.h          |  8 ++---
 src/nfa/sheng_defs.h     | 70 ++++++++++++++++++++--------------------
 src/nfa/sheng_impl.h     | 38 +++++++++++++++++++++-
 src/nfa/sheng_impl4.h    | 61 ++++++++++++++++++++++++++++++++--
 src/nfa/shengcompile.cpp | 14 ++++++++
 6 files changed, 153 insertions(+), 46 deletions(-)

diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c
index 3f36e218..922e8f80 100644
--- a/src/nfa/sheng.c
+++ b/src/nfa/sheng.c
@@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt,
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 // Sheng32
 static really_inline
 const struct sheng32 *get_sheng32(const struct NFA *n) {
@@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
     }
     return MO_CONTINUE_MATCHING; /* continue execution */
 }
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || HAVE_SVE
 
 /* include Sheng function definitions */
 #include "sheng_defs.h"
@@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest,
     return 0;
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 // Sheng32
 static really_inline
 char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
@@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest,
     *(u8 *)dest = *(const u8 *)src;
     return 0;
 }
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || HAVE_SVE
diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h
index 7b90e303..212bd3a4 100644
--- a/src/nfa/sheng.h
+++ b/src/nfa/sheng.h
@@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q);
 char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer,
                     size_t length, NfaCallback cb, void *context);
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
 
@@ -106,8 +106,7 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q);
 
 char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
                       size_t length, NfaCallback cb, void *context);
-
-#else // !HAVE_AVX512VBMI
+#else // !HAVE_AVX512VBMI && !HAVE_SVE
 
 #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL
 #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL
@@ -138,6 +137,7 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
 #define nfaExecSheng64_testEOD NFA_API_NO_IMPL
 #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL
 #define nfaExecSheng64_B NFA_API_NO_IMPL
-#endif // end of HAVE_AVX512VBMI
+#endif // end of HAVE_AVX512VBMI || defined(HAVE_SVE)
+
 
 #endif /* SHENG_H_ */
diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h
index 390af752..886af28e 100644
--- a/src/nfa/sheng_defs.h
+++ b/src/nfa/sheng_defs.h
@@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) {
     return (a | b | c | d) & (SHENG_STATE_FLAG_MASK);
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 static really_inline
 u8 isDeadState32(const u8 a) {
     return a & SHENG32_STATE_DEAD;
@@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_cod
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_cod
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_co
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_co
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_samd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_samd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 isAcceptState32
@@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_sam
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_sam
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 isAcceptState32
@@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nmd
 #define DEAD_FUNC isDeadState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_nmd
 #define DEAD_FUNC32 isDeadState32
 #define ACCEPT_FUNC32 dummyFunc
@@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define SHENG_IMPL sheng_nm
 #define DEAD_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_nm
 #define DEAD_FUNC32 dummyFunc
 #define ACCEPT_FUNC32 dummyFunc
@@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef SHENG_IMPL
 #undef DEAD_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef DEAD_FUNC32
 #undef ACCEPT_FUNC32
@@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_coda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_cod
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_coa
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_co
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_samda
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_samd
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 isDeadState32
@@ -511,7 +511,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC isAccelState
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_sama
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_sam
 #define INTERESTING_FUNC32 hasInterestingStates32
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC isAccelState
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nmda
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nmd
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
@@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #define INNER_ACCEL_FUNC dummyFunc
 #define OUTER_ACCEL_FUNC dummyFunc
 #define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #define SHENG32_IMPL sheng32_4_nm
 #define INTERESTING_FUNC32 dummyFunc4
 #define INNER_DEAD_FUNC32 dummyFunc
@@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) {
 #undef INNER_ACCEL_FUNC
 #undef OUTER_ACCEL_FUNC
 #undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 #undef SHENG32_IMPL
 #undef INTERESTING_FUNC32
 #undef INNER_DEAD_FUNC32
diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h
index 1fa5c831..2c701446 100644
--- a/src/nfa/sheng_impl.h
+++ b/src/nfa/sheng_impl.h
@@ -96,7 +96,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
                   const struct sheng32 *s,
@@ -114,14 +114,28 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
     }
     DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
 
+#if defined(HAVE_SVE)
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    svuint8_t cur_state = svdup_u8(*state);
+    svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F);
+    const m512 *masks = s->succ_masks;
+#else
     m512 cur_state = set1_64x8(*state);
     const m512 *masks = s->succ_masks;
+#endif
 
     while (likely(cur_buf != end)) {
         const u8 c = *cur_buf;
+
+#if defined(HAVE_SVE)
+        svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c));
+        cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 tmp = svlastb(lane_pred_32, cur_state);
+#else
         const m512 succ_mask = masks[c];
         cur_state = vpermb512(cur_state, succ_mask);
         const u8 tmp = movd512(cur_state);
+#endif
 
         DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
         DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK,
@@ -153,7 +167,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         }
         cur_buf++;
     }
+#if defined(HAVE_SVE)
+    *state = svlastb(lane_pred_32, cur_state);
+#else
     *state = movd512(cur_state);
+#endif
     *scan_end = cur_buf;
     return MO_CONTINUE_MATCHING;
 }
@@ -175,14 +193,28 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
     }
     DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
 
+#if defined(HAVE_SVE)
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    svuint8_t cur_state = svdup_u8(*state);
+    svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F);
+    const m512 *masks = s->succ_masks;
+#else
     m512 cur_state = set1_64x8(*state);
     const m512 *masks = s->succ_masks;
+#endif
 
     while (likely(cur_buf != end)) {
         const u8 c = *cur_buf;
+
+#if defined(HAVE_SVE)
+        svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c));
+        cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 tmp = svlastb(lane_pred_64, cur_state);
+#else
         const m512 succ_mask = masks[c];
         cur_state = vpermb512(cur_state, succ_mask);
         const u8 tmp = movd512(cur_state);
+#endif
 
         DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
         DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK,
@@ -214,7 +246,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         }
         cur_buf++;
     }
+#if defined(HAVE_SVE)
+    *state = svlastb(lane_pred_64, cur_state);
+#else
     *state = movd512(cur_state);
+#endif
     *scan_end = cur_buf;
     return MO_CONTINUE_MATCHING;
 }
diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h
index e5d3468f..718c3409 100644
--- a/src/nfa/sheng_impl4.h
+++ b/src/nfa/sheng_impl4.h
@@ -283,7 +283,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     return MO_CONTINUE_MATCHING;
 }
 
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 static really_inline
 char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
                   const struct sheng32 *s,
@@ -320,8 +320,15 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         return MO_CONTINUE_MATCHING;
     }
 
+#if defined(HAVE_SVE)
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    svuint8_t cur_state = svdup_u8(*state);
+    svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F);
+    const m512 *masks = s->succ_masks;
+#else
     m512 cur_state = set1_64x8(*state);
     const m512 *masks = s->succ_masks;
+#endif
 
     while (likely(end - cur_buf >= 4)) {
         const u8 *b1 = cur_buf;
@@ -333,6 +340,23 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         const u8 c3 = *b3;
         const u8 c4 = *b4;
 
+#if defined(HAVE_SVE)
+        svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1));
+        cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a1 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2));
+        cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a2 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3));
+        cur_state = svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a3 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4));
+        cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a4 = svlastb(lane_pred_32, cur_state);
+#else
         const m512 succ_mask1 = masks[c1];
         cur_state = vpermb512(cur_state, succ_mask1);
         const u8 a1 = movd512(cur_state);
@@ -348,6 +372,7 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         const m512 succ_mask4 = masks[c4];
         cur_state = vpermb512(cur_state, succ_mask4);
         const u8 a4 = movd512(cur_state);
+#endif
 
         DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
         DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK,
@@ -517,7 +542,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         };
         cur_buf += 4;
     }
+#if defined(HAVE_SVE)
+    *state = svlastb(lane_pred_32, cur_state);
+#else
     *state = movd512(cur_state);
+#endif
     *scan_end = cur_buf;
     return MO_CONTINUE_MATCHING;
 }
@@ -541,9 +570,15 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         *scan_end = end;
         return MO_CONTINUE_MATCHING;
     }
-
+#if defined(HAVE_SVE)
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    svuint8_t cur_state = svdup_u8(*state);
+    svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F);
+    const m512 *masks = s->succ_masks;
+#else
     m512 cur_state = set1_64x8(*state);
     const m512 *masks = s->succ_masks;
+#endif
 
     while (likely(end - cur_buf >= 4)) {
         const u8 *b1 = cur_buf;
@@ -555,6 +590,23 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         const u8 c3 = *b3;
         const u8 c4 = *b4;
 
+#if defined(HAVE_SVE)
+        svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1));
+        cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a1 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2));
+        cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a2 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3));
+        cur_state = svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a3 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4));
+        cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state));
+        const u8 a4 = svlastb(lane_pred_64, cur_state);
+#else
         const m512 succ_mask1 = masks[c1];
         cur_state = vpermb512(cur_state, succ_mask1);
         const u8 a1 = movd512(cur_state);
@@ -570,6 +622,7 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         const m512 succ_mask4 = masks[c4];
         cur_state = vpermb512(cur_state, succ_mask4);
         const u8 a4 = movd512(cur_state);
+#endif
 
         DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
         DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK,
@@ -703,7 +756,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
         }
         cur_buf += 4;
     }
+#if defined(HAVE_SVE)
+    *state = svlastb(lane_pred_64, cur_state);
+#else
     *state = movd512(cur_state);
+#endif
     *scan_end = cur_buf;
     return MO_CONTINUE_MATCHING;
 }
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index 055e1971..0f93e139 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -730,10 +730,17 @@ bytecode_ptr<NFA> sheng32Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
+#ifdef HAVE_SVE
+    if (svcntb()<32) {
+        DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n");
+        return nullptr;
+    }
+#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
+#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);
@@ -762,10 +769,17 @@ bytecode_ptr<NFA> sheng64Compile(raw_dfa &raw, const CompileContext &cc,
         return nullptr;
     }
 
+#ifdef HAVE_SVE
+    if (svcntb()<64) {
+        DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n");
+        return nullptr;
+    }
+#else
     if (!cc.target_info.has_avx512vbmi()) {
         DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n");
         return nullptr;
     }
+#endif
 
     sheng_build_strat strat(raw, rm, only_accel_init);
     dfa_info info(strat);

From 62e3450eaed93f078a506a1ce19640e7c2a3d931 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Mon, 29 Apr 2024 10:26:39 +0300
Subject: [PATCH 44/56] missingOverride

---
 tools/hsbench/engine_chimera.h   | 20 ++++++++++----------
 tools/hsbench/engine_hyperscan.h | 20 ++++++++++----------
 tools/hsbench/engine_pcre.h      | 20 ++++++++++----------
 util/ng_corpus_generator.cpp     |  4 ++--
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/tools/hsbench/engine_chimera.h b/tools/hsbench/engine_chimera.h
index 187dec8c..52ec1179 100644
--- a/tools/hsbench/engine_chimera.h
+++ b/tools/hsbench/engine_chimera.h
@@ -66,32 +66,32 @@ public:
     explicit EngineChimera(ch_database_t *db, CompileCHStats cs);
     ~EngineChimera();
 
-    std::unique_ptr<EngineContext> makeContext() const;
+    std::unique_ptr<EngineContext> makeContext() const override;
 
     void scan(const char *data, unsigned int len, unsigned int id,
-              ResultEntry &result, EngineContext &ectx) const;
+              ResultEntry &result, EngineContext &ectx) const override;
 
     void scan_vectored(const char *const *data, const unsigned int *len,
                        unsigned int count, unsigned int streamId,
-                       ResultEntry &result, EngineContext &ectx) const;
+                       ResultEntry &result, EngineContext &ectx) const override;
 
     std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
-                                             unsigned id) const;
+                                             unsigned id) const override;
 
     void streamClose(std::unique_ptr<EngineStream> stream,
-                     ResultEntry &result) const;
+                     ResultEntry &result) const override;
 
     void streamCompressExpand(EngineStream &stream,
-                              std::vector<char> &temp) const;
+                              std::vector<char> &temp) const override;
 
     void streamScan(EngineStream &stream, const char *data, unsigned int len,
-                    unsigned int id, ResultEntry &result) const;
+                    unsigned int id, ResultEntry &result) const override;
 
-    void printStats() const;
+    void printStats() const override;
 
-    void printCsvStats() const;
+    void printCsvStats() const override;
 
-    void sqlStats(SqlDB &db) const;
+    void sqlStats(SqlDB &db) const override;
 
 private:
     ch_database_t *db;
diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h
index afbdf098..ccbc2fa8 100644
--- a/tools/hsbench/engine_hyperscan.h
+++ b/tools/hsbench/engine_hyperscan.h
@@ -75,32 +75,32 @@ public:
     explicit EngineHyperscan(hs_database_t *db, CompileHSStats cs);
     ~EngineHyperscan();
 
-    std::unique_ptr<EngineContext> makeContext() const;
+    std::unique_ptr<EngineContext> makeContext() const override;
 
     void scan(const char *data, unsigned int len, unsigned int id,
-              ResultEntry &result, EngineContext &ectx) const;
+              ResultEntry &result, EngineContext &ectx) const override;
 
     void scan_vectored(const char *const *data, const unsigned int *len,
                        unsigned int count, unsigned int streamId,
-                       ResultEntry &result, EngineContext &ectx) const;
+                       ResultEntry &result, EngineContext &ectx) const override;
 
     std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
-                                             unsigned id) const;
+                                             unsigned id) const override;
 
     void streamClose(std::unique_ptr<EngineStream> stream,
-                     ResultEntry &result) const;
+                     ResultEntry &result) const override;
 
     void streamCompressExpand(EngineStream &stream,
-                              std::vector<char> &temp) const;
+                              std::vector<char> &temp) const override;
 
     void streamScan(EngineStream &stream, const char *data, unsigned int len,
-                    unsigned int id, ResultEntry &result) const;
+                    unsigned int id, ResultEntry &result) const override;
 
-    void printStats() const;
+    void printStats() const override;
 
-    void printCsvStats() const;
+    void printCsvStats() const override;
 
-    void sqlStats(SqlDB &db) const;
+    void sqlStats(SqlDB &db) const override;
 
 private:
     hs_database_t *db;
diff --git a/tools/hsbench/engine_pcre.h b/tools/hsbench/engine_pcre.h
index 9569bef4..7ae9147f 100644
--- a/tools/hsbench/engine_pcre.h
+++ b/tools/hsbench/engine_pcre.h
@@ -74,32 +74,32 @@ public:
                         CompilePCREStats cs, int capture_cnt_in);
     ~EnginePCRE();
 
-    std::unique_ptr<EngineContext> makeContext() const;
+    std::unique_ptr<EngineContext> makeContext() const override;
 
     void scan(const char *data, unsigned int len, unsigned int id,
-              ResultEntry &result, EngineContext &ectx) const;
+              ResultEntry &result, EngineContext &ectx) const override;
 
     void scan_vectored(const char *const *data, const unsigned int *len,
                        unsigned int count, unsigned int streamId,
-                       ResultEntry &result, EngineContext &ectx) const;
+                       ResultEntry &result, EngineContext &ectx) const override;
 
     std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
-                                             unsigned id) const;
+                                             unsigned id) const override;
 
     void streamClose(std::unique_ptr<EngineStream> stream,
-                     ResultEntry &result) const;
+                     ResultEntry &result) const override;
 
     void streamCompressExpand(EngineStream &stream,
-                              std::vector<char> &temp) const;
+                              std::vector<char> &temp) const override;
 
     void streamScan(EngineStream &stream, const char *data, unsigned int len,
-                    unsigned int id, ResultEntry &result) const;
+                    unsigned int id, ResultEntry &result) const override;
 
-    void printStats() const;
+    void printStats() const override;
 
-    void printCsvStats() const;
+    void printCsvStats() const override;
 
-    void sqlStats(SqlDB &db) const;
+    void sqlStats(SqlDB &db) const override;
 
 private:
     std::vector<std::unique_ptr<PcreDB>> dbs;
diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp
index 6c3f613d..68aa5583 100644
--- a/util/ng_corpus_generator.cpp
+++ b/util/ng_corpus_generator.cpp
@@ -223,7 +223,7 @@ public:
                         CorpusProperties &props);
     ~CorpusGeneratorImpl() = default;
 
-    void generateCorpus(vector<string> &data);
+    void generateCorpus(vector<string> &data) override;
 
 private:
     unsigned char getRandomChar();
@@ -419,7 +419,7 @@ public:
                         CorpusProperties &props);
     ~CorpusGeneratorUtf8() = default;
 
-    void generateCorpus(vector<string> &data);
+    void generateCorpus(vector<string> &data) override;
 
 private:
     unichar getRandomChar();

From f463357a380ee0b5d9177d92561162a2b95b8686 Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Mon, 29 Apr 2024 12:39:28 +0300
Subject: [PATCH 45/56] Some of the consts couldn't be propagated and had to be
 reverted

---
 src/nfa/goughcompile.cpp     | 2 +-
 src/nfa/limex_runtime_impl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp
index d703a32c..59ef052f 100644
--- a/src/nfa/goughcompile.cpp
+++ b/src/nfa/goughcompile.cpp
@@ -659,7 +659,7 @@ GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
 }
 
 const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(
-                                                 const GoughSSAVar *input) const {
+                                                 GoughSSAVar *input) const {
     return input_map.at(input);
 }
 
diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h
index b282ae18..7b89182b 100644
--- a/src/nfa/limex_runtime_impl.h
+++ b/src/nfa/limex_runtime_impl.h
@@ -927,7 +927,7 @@ char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
                       context);
 }
 
-char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, const struct mq *q) {
+char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
     const IMPL_NFA_T *limex = getImplNfa(n);
     REPORTCURRENT_FN(limex, q);
     return 1;

From 987cd17160f962e1e1a36b9a49ec0c2ac4d8bdf5 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Mon, 29 Apr 2024 13:13:07 +0300
Subject: [PATCH 46/56] variableScope

---
 benchmarks/benchmarks.cpp           |  7 +++----
 examples/patbench.cc                |  3 ++-
 src/crc32.c                         |  4 ++--
 src/nfa/castle.c                    |  3 +--
 src/nfa/repeatcompile.cpp           |  9 +++------
 src/nfagraph/ng_haig.cpp            |  4 ++--
 src/nfagraph/ng_redundancy.cpp      |  4 ++--
 src/nfagraph/ng_violet.cpp          |  7 +++----
 src/parser/ComponentAlternation.cpp |  8 ++++----
 src/parser/ComponentSequence.cpp    |  8 ++++----
 src/parser/logical_combination.cpp  |  4 ++--
 src/rose/rose_build_add_mask.cpp    |  3 ++-
 src/rose/rose_build_bytecode.cpp    |  3 +--
 src/util/arch/common/bitutils.h     | 18 +++++++++---------
 unit/internal/repeat.cpp            |  9 +++------
 unit/internal/rose_mask.cpp         |  3 +--
 unit/internal/rose_mask_32.cpp      |  3 +--
 17 files changed, 45 insertions(+), 55 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index e48652e9..3c1a0bab 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -63,12 +63,10 @@ static void run_benchmarks(int size, int loops, int max_matches,
                            InitFunc &&init, BenchFunc &&func) {
     init(bench);
     double total_sec = 0.0;
-    u64a total_size = 0;
-    double bw = 0.0;
-    double avg_bw = 0.0;
     double max_bw = 0.0;
     double avg_time = 0.0;
     if (max_matches) {
+        double avg_bw = 0.0;
         int pos = 0;
         for (int j = 0; j < max_matches - 1; j++) {
             bench.buf[pos] = 'b';
@@ -90,7 +88,7 @@ static void run_benchmarks(int size, int loops, int max_matches,
             total_sec += dt;
             /*convert microseconds to seconds*/
             /*calculate bandwidth*/
-            bw = (actual_size / dt) * 1000000.0 / 1048576.0;
+            double bw = (actual_size / dt) * 1000000.0 / 1048576.0;
             /*std::cout << "act_size = " << act_size << std::endl;
             std::cout << "dt = " << dt << std::endl;
             std::cout << "bw = " << bw << std::endl;*/
@@ -107,6 +105,7 @@ static void run_benchmarks(int size, int loops, int max_matches,
         printf("%-18s, %-12d, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
                bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
     } else {
+        u64a total_size = 0;
         auto start = std::chrono::steady_clock::now();
         for (int i = 0; i < loops; i++) {
             const u8 *res = func(bench);
diff --git a/examples/patbench.cc b/examples/patbench.cc
index 1f965f13..b1ec2766 100644
--- a/examples/patbench.cc
+++ b/examples/patbench.cc
@@ -605,8 +605,9 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
         scan_time = measure_stream_time(bench, repeatCount);
     }
     size_t bytes = bench.bytes();
-    size_t matches = bench.matches();
+    
     if (diagnose) {
+        size_t matches = bench.matches();
         std::ios::fmtflags f(cout.flags());
         cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
              << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
diff --git a/src/crc32.c b/src/crc32.c
index 19c7b7fa..c2f15119 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -547,9 +547,9 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
         u32 block = *(const u32 *)p_buf;
         crc ^= block;
         p_buf += 4;
-        term1 = crc_tableil8_o88[crc & 0x000000FF] ^
+        u32 term1 = crc_tableil8_o88[crc & 0x000000FF] ^
                 crc_tableil8_o80[(crc >> 8) & 0x000000FF];
-        term2 = crc >> 16;
+        u32 term2 = crc >> 16;
         crc = term1 ^
               crc_tableil8_o72[term2 & 0x000000FF] ^
               crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
diff --git a/src/nfa/castle.c b/src/nfa/castle.c
index 29208f8d..128f3489 100644
--- a/src/nfa/castle.c
+++ b/src/nfa/castle.c
@@ -489,7 +489,6 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
         // full_state (scratch).
 
         u64a offset = end; // min offset of next match
-        u32 activeIdx = 0;
         mmbit_clear(matching, c->numRepeats);
         if (c->exclusive) {
             u8 *active = (u8 *)stream_state;
@@ -497,7 +496,7 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
             for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
                  i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
                 u8 *cur = active + i * c->activeIdxSize;
-                activeIdx = partial_load_u32(cur, c->activeIdxSize);
+                u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
                 u64a match = subCastleNextMatch(c, full_state, stream_state,
                                                 loc, activeIdx);
                 set_matching(c, match, groups, matching, c->numGroups, i,
diff --git a/src/nfa/repeatcompile.cpp b/src/nfa/repeatcompile.cpp
index 60b51352..d9544675 100644
--- a/src/nfa/repeatcompile.cpp
+++ b/src/nfa/repeatcompile.cpp
@@ -94,9 +94,6 @@ u32 repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
 static
 u32 findOptimalPatchSize(struct RepeatStateInfo *info, const depth &repeatMax,
                          const u32 minPeriod, u32 rv) {
-    u32 cnt = 0;
-    u32 patch_bits = 0;
-    u32 total_size = 0;
     u32 min = ~0U;
     u32 patch_len = 0;
 
@@ -105,11 +102,11 @@ u32 findOptimalPatchSize(struct RepeatStateInfo *info, const depth &repeatMax,
     }
 
     for (u32 i = minPeriod; i <= rv; i++) {
-        cnt = ((u32)repeatMax + (i - 1)) / i + 1;
+        u32 cnt = ((u32)repeatMax + (i - 1)) / i + 1;
 
         // no bit packing version
-        patch_bits = calcPackedBits(info->table[i]);
-        total_size = (patch_bits + 7U) / 8U * cnt;
+        u32 patch_bits = calcPackedBits(info->table[i]);
+        u32 total_size = (patch_bits + 7U) / 8U * cnt;
 
         if (total_size < min) {
             patch_len = i;
diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp
index bf951a0b..ede0f79b 100644
--- a/src/nfagraph/ng_haig.cpp
+++ b/src/nfagraph/ng_haig.cpp
@@ -513,12 +513,12 @@ static
 bool doHaig(const NGHolder &g, som_type som,
             const vector<vector<CharReach>> &triggers, bool unordered_som,
             raw_som_dfa *rdfa) {
-    u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from
-                                                     a fight */
     using StateSet = typename Auto::StateSet;
     vector<StateSet> nfa_state_map;
     Auto n(g, som, triggers, unordered_som);
     try {
+        u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from
+                                                     a fight */
         if (!determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
             DEBUG_PRINTF("state limit exceeded\n");
             return false;
diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp
index a499a40d..baabf285 100644
--- a/src/nfagraph/ng_redundancy.cpp
+++ b/src/nfagraph/ng_redundancy.cpp
@@ -636,12 +636,12 @@ bool reversePathReachSubset(const NFAEdge &e, const NFAVertex &dom,
 
     NFAVertex start = source(e, g);
     using RevGraph = boost::reverse_graph<NGHolder, const NGHolder &>;
-    map<RevGraph::vertex_descriptor, boost::default_color_type> vertexColor;
 
     // Walk the graph backwards from v, examining each node. We fail (return
     // false) if we encounter a node with reach NOT a subset of domReach, and
     // we stop searching at dom.
     try {
+        map<RevGraph::vertex_descriptor, boost::default_color_type> vertexColor;
         depth_first_visit(RevGraph(g), start,
                           ReachSubsetVisitor(domReach),
                           make_assoc_property_map(vertexColor),
@@ -664,12 +664,12 @@ bool forwardPathReachSubset(const NFAEdge &e, const NFAVertex &dom,
     }
 
     NFAVertex start = target(e, g);
-    map<NFAVertex, boost::default_color_type> vertexColor;
 
     // Walk the graph forward from v, examining each node. We fail (return
     // false) if we encounter a node with reach NOT a subset of domReach, and
     // we stop searching at dom.
     try {
+        map<NFAVertex, boost::default_color_type> vertexColor;
         depth_first_visit(g, start, ReachSubsetVisitor(domReach),
                           make_assoc_property_map(vertexColor),
                           VertexIs<NGHolder, NFAVertex>(dom));
diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp
index 02461e98..b39cfba2 100644
--- a/src/nfagraph/ng_violet.cpp
+++ b/src/nfagraph/ng_violet.cpp
@@ -348,10 +348,9 @@ void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
 
     map<NFAVertex, u64a> scores;
     map<NFAVertex, unique_ptr<VertLitInfo>> lit_info;
-    set<ue2_literal> s;
 
     for (auto v : a_dom) {
-        s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
+        set<ue2_literal> s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
                                           revisits to the target vertex */
 
         if (s.empty()) {
@@ -2868,7 +2867,6 @@ static
 bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
                               const vector<RoseInEdge> &edges,
                               const CompileContext &cc) {
-    vector<pair<ue2_literal, u32>> succ_lits;
     DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n",
                   to_string(h.kind).c_str(), num_vertices(h), edges.size());
 
@@ -2877,6 +2875,7 @@ bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
     }
 
     if (!generates_callbacks(h)) {
+        vector<pair<ue2_literal, u32>> succ_lits;
         for (const auto &e : edges) {
             const auto &lit = vg[target(e, vg)].s;
             u32 delay = vg[e].graph_lag;
@@ -2889,8 +2888,8 @@ bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
     }
 
     unique_ptr<VertLitInfo> split;
-    bool last_chance = true;
     if (h.kind == NFA_PREFIX) {
+        bool last_chance = true;
         auto depths = calcDepths(h);
 
         split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc);
diff --git a/src/parser/ComponentAlternation.cpp b/src/parser/ComponentAlternation.cpp
index e38c9ce7..c4bad672 100644
--- a/src/parser/ComponentAlternation.cpp
+++ b/src/parser/ComponentAlternation.cpp
@@ -109,20 +109,20 @@ void ComponentAlternation::append(unique_ptr<Component> component) {
 vector<PositionInfo> ComponentAlternation::first() const {
     // firsts come from all our subcomponents in position order. This will
     // maintain left-to-right priority order.
-    vector<PositionInfo> firsts, subfirsts;
+    vector<PositionInfo> firsts;
 
     for (const auto &c : children) {
-        subfirsts = c->first();
+         vector<PositionInfo> subfirsts = c->first();
         firsts.insert(firsts.end(), subfirsts.begin(), subfirsts.end());
     }
     return firsts;
 }
 
 vector<PositionInfo> ComponentAlternation::last() const {
-    vector<PositionInfo> lasts, sublasts;
+    vector<PositionInfo> lasts;
 
     for (const auto &c : children) {
-        sublasts = c->last();
+        vector<PositionInfo> sublasts = c->last();
         lasts.insert(lasts.end(), sublasts.begin(), sublasts.end());
     }
     return lasts;
diff --git a/src/parser/ComponentSequence.cpp b/src/parser/ComponentSequence.cpp
index 2b78177b..f5200206 100644
--- a/src/parser/ComponentSequence.cpp
+++ b/src/parser/ComponentSequence.cpp
@@ -157,10 +157,10 @@ void ComponentSequence::finalize() {
 }
 
 vector<PositionInfo> ComponentSequence::first() const {
-    vector<PositionInfo> firsts, subfirsts;
+    vector<PositionInfo> firsts;
 
     for (const auto &c : children) {
-        subfirsts = c->first();
+        vector<PositionInfo> subfirsts = c->first();
         replaceEpsilons(firsts, subfirsts);
         if (!c->empty()) {
             break;
@@ -229,12 +229,12 @@ void applyEpsilonVisits(vector<PositionInfo> &lasts,
 }
 
 vector<PositionInfo> ComponentSequence::last() const {
-    vector<PositionInfo> lasts, sublasts;
+    vector<PositionInfo> lasts;
     vector<eps_info> visits(1);
 
     auto i = children.rbegin(), e = children.rend();
     for (; i != e; ++i) {
-        sublasts = (*i)->last();
+        vector<PositionInfo> sublasts = (*i)->last();
         applyEpsilonVisits(sublasts, visits);
         lasts.insert(lasts.end(), sublasts.begin(), sublasts.end());
         if ((*i)->empty()) {
diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp
index a37f4e5f..bbe41b83 100644
--- a/src/parser/logical_combination.cpp
+++ b/src/parser/logical_combination.cpp
@@ -260,14 +260,14 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical,
                                             u32 ekey, u64a min_offset,
                                             u64a max_offset) {
     u32 ckey = getCombKey(id);
-    vector<LogicalOperator> op_stack;
     vector<u32> subid_stack;
     u32 lkey_start = INVALID_LKEY; // logical operation's lkey
-    u32 paren = 0; // parentheses
     u32 digit = (u32)-1; // digit start offset, invalid offset is -1
     u32 subid = (u32)-1;
     u32 i;
     try {
+        vector<LogicalOperator> op_stack;
+        u32 paren = 0; // parentheses
         for (i = 0; logical[i]; i++) {
             if (isdigit(logical[i])) {
                 if (digit == (u32)-1) { // new digit start
diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp
index c3736f62..7e0e00b0 100644
--- a/src/rose/rose_build_add_mask.cpp
+++ b/src/rose/rose_build_add_mask.cpp
@@ -393,8 +393,9 @@ bool validateTransientMask(const vector<CharReach> &mask, bool anchored,
            none_of(begin(lits), end(lits), mixed_sensitivity));
 
     // Build the HWLM literal mask.
-    vector<u8> msk, cmp;
+    vector<u8> msk;
     if (grey.roseHamsterMasks) {
+        vector<u8> cmp;
         buildLiteralMask(mask, msk, cmp, delay);
     }
 
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 2df3b3a3..639edddc 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -2251,10 +2251,9 @@ vector<u32> buildSuffixEkeyLists(const RoseBuildImpl &build, build_context &bc,
 
     /* for each outfix also build elists */
     for (const auto &outfix : build.outfixes) {
-        u32 qi = outfix.get_queue();
         set<u32> ekeys = reportsToEkeys(all_reports(outfix), build.rm);
-
         if (!ekeys.empty()) {
+            u32 qi = outfix.get_queue();
             qi_to_ekeys[qi] = {ekeys.begin(), ekeys.end()};
         }
     }
diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
index e5ab0d05..897c9197 100644
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -155,13 +155,13 @@ u32 compress32_impl_c(u32 x, u32 m) {
         return 0;
     }
 
-    u32 mk, mp, mv, t;
+    u32 mk, mv;
 
     x &= m; // clear irrelevant bits
 
     mk = ~m << 1; // we will count 0's to right
     for (u32 i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1);
+        u32 mp = mk ^ (mk << 1);
         mp ^= mp << 2;
         mp ^= mp << 4;
         mp ^= mp << 8;
@@ -169,7 +169,7 @@ u32 compress32_impl_c(u32 x, u32 m) {
 
         mv = mp & m; // bits to move
         m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        t = x & mv;
+        u32 t = x & mv;
         x = (x ^ t) | (t >> (1 << i)); // compress x
         mk = mk & ~mp;
     }
@@ -239,14 +239,14 @@ u32 expand32_impl_c(u32 x, u32 m) {
         return 0;
     }
 
-    u32 m0, mk, mp, mv, t;
+    u32 m0, mk, mv;
     u32 array[5];
 
     m0 = m; // save original mask
     mk = ~m << 1; // we will count 0's to right
 
     for (int i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1); // parallel suffix
+        u32 mp = mk ^ (mk << 1); // parallel suffix
         mp = mp ^ (mp << 2);
         mp = mp ^ (mp << 4);
         mp = mp ^ (mp << 8);
@@ -259,7 +259,7 @@ u32 expand32_impl_c(u32 x, u32 m) {
 
     for (int i = 4; i >= 0; i--) {
         mv = array[i];
-        t = x << (1 << i);
+        u32 t = x << (1 << i);
         x = (x & ~mv) | (t & mv);
     }
 
@@ -409,7 +409,7 @@ u64a pdep64_impl_c(u64a x, u64a _m) {
     u64a result = 0x0UL;
     const u64a mask = 0x8000000000000000UL;
     u64a m = _m;
-    u64a c, t;
+    
     u64a p;
 
     /* The pop-count of the mask gives the number of the bits from
@@ -421,8 +421,8 @@ u64a pdep64_impl_c(u64a x, u64a _m) {
      each mask bit as it is processed.  */
     while (m != 0)
     {
-        c = __builtin_clzl (m);
-        t = x << (p - c);
+        u64a c = __builtin_clzl (m);
+        u64a t = x << (p - c);
         m ^= (mask >> c);
         result |= (t & (mask >> c));
         p++;
diff --git a/unit/internal/repeat.cpp b/unit/internal/repeat.cpp
index 5665a0c3..41a54c5f 100644
--- a/unit/internal/repeat.cpp
+++ b/unit/internal/repeat.cpp
@@ -277,10 +277,9 @@ TEST_P(RepeatTest, FillRing) {
     }
 
     // We should be able to see matches for all of these (beyond the last top offset).
-    enum TriggerResult rv;
     for (u64a i = offset + info.repeatMax;
             i <= offset + info.repeatMax + info.repeatMin; i++) {
-        rv = processTugTrigger(&info, ctrl, state, i);
+        enum TriggerResult rv = processTugTrigger(&info, ctrl, state, i);
         if (rv == TRIGGER_SUCCESS_CACHE) {
             rv = TRIGGER_SUCCESS;
         }
@@ -998,16 +997,14 @@ TEST_P(SparseOptimalTest, FillTops) {
     repeatStore(info, ctrl, state, offset, 0);
     ASSERT_EQ(offset, repeatLastTop(info, ctrl, state));
 
-    u64a offset2;
     for (u32 i = min_period; i < patch_count * patch_size; i += min_period) {
-        offset2 = offset + i;
+        u64a offset2 = offset + i;
         repeatStore(info, ctrl, state, offset2, 1);
         ASSERT_EQ(offset2, repeatLastTop(info, ctrl, state));
     }
 
-    u64a exit2;
     for (u32 i = 0; i < patch_count * patch_size; i += min_period) {
-        exit2 = exit + i;
+        u64a exit2 = exit + i;
         for (u32 j = exit2 + info->repeatMin;
              j <= offset + info->repeatMax; j++) {
             ASSERT_EQ(REPEAT_MATCH, repeatHasMatch(info, ctrl, state, j));
diff --git a/unit/internal/rose_mask.cpp b/unit/internal/rose_mask.cpp
index e6be00f3..ed1af956 100644
--- a/unit/internal/rose_mask.cpp
+++ b/unit/internal/rose_mask.cpp
@@ -87,12 +87,11 @@ static int initLegalValidMasks(u64a validMasks[]) {
  */
 static int initLegalNegMasks(u64a negMasks[]) {
     u64a data = 0;
-    u64a offset;
     int num = 0;
     while (data != ONES64) {
         negMasks[num] = data;
         num++;
-        offset = (data | (data +1)) ^ data;
+        u64a offset = (data | (data +1)) ^ data;
         data += 0xfeULL * offset + 1;
     }
     negMasks[num] = data;
diff --git a/unit/internal/rose_mask_32.cpp b/unit/internal/rose_mask_32.cpp
index 732f51a0..bb444ead 100644
--- a/unit/internal/rose_mask_32.cpp
+++ b/unit/internal/rose_mask_32.cpp
@@ -194,10 +194,9 @@ TEST(ValidateMask32, testMask32_3) {
             u32 valid_mask = ONES32 << (left + right) >> left;
             for (int i = 0; i < test_len; i++) {
                 const auto &t = testBasic[i];
-                int bool_result;
                 for (int j = 0; j < 5000; j++) {
                     u32 neg_mask = neg_mask_rand.Generate(1u << 31);
-                    bool_result = (neg_mask & valid_mask) ==
+                    int bool_result = (neg_mask & valid_mask) ==
                                   (t.neg_mask & valid_mask);
                     EXPECT_EQ(bool_result, validateMask32(t.data.a256,
                                                           valid_mask,

From ec8cda3f49abb45be49749146e498510554b231e Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Mon, 29 Apr 2024 13:28:16 +0300
Subject: [PATCH 47/56] variableScopeFix

---
 src/crc32.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/crc32.c b/src/crc32.c
index c2f15119..ca5b5fed 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -542,7 +542,6 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
 
     // Main aligned loop, processes eight bytes at a time.
 
-    u32 term1, term2;
     for (size_t li = 0; li < running_length/8; li++) {
         u32 block = *(const u32 *)p_buf;
         crc ^= block;

From 2a476df2c51f182bf98bd0ed17e6f0c5a3706d93 Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Mon, 29 Apr 2024 13:38:35 +0300
Subject: [PATCH 48/56] fixed const adjustments.

---
 src/rose/rose_build_matchers.h | 2 +-
 unit/hyperscan/test_util.cpp   | 2 +-
 unit/hyperscan/test_util.h     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h
index ef8999ed..37a96c7a 100644
--- a/src/rose/rose_build_matchers.h
+++ b/src/rose/rose_build_matchers.h
@@ -101,7 +101,7 @@ struct LitProto {
 };
 
 bytecode_ptr<HWLM>
-buildHWLMMatcher(const RoseBuildImpl &build, LitProto *proto);
+buildHWLMMatcher(const RoseBuildImpl &build, const LitProto *proto);
 
 std::unique_ptr<LitProto>
 buildFloatingMatcherProto(const RoseBuildImpl &build,
diff --git a/unit/hyperscan/test_util.cpp b/unit/hyperscan/test_util.cpp
index c7a26acd..10d23962 100644
--- a/unit/hyperscan/test_util.cpp
+++ b/unit/hyperscan/test_util.cpp
@@ -92,7 +92,7 @@ hs_database_t *buildDB(const pattern &expr, unsigned int mode) {
 
 hs_database_t *buildDB(const char *expression, unsigned int flags,
                        unsigned int id, unsigned int mode,
-                       hs_platform_info_t *plat) {
+                       const hs_platform_info_t *plat) {
     return buildDB({pattern(expression, flags, id)}, mode, plat);
 }
 
diff --git a/unit/hyperscan/test_util.h b/unit/hyperscan/test_util.h
index 21862b6b..01fdca0b 100644
--- a/unit/hyperscan/test_util.h
+++ b/unit/hyperscan/test_util.h
@@ -99,11 +99,11 @@ struct pattern {
 std::ostream &operator<<(std::ostream &o, const pattern &p);
 
 hs_database_t *buildDB(const std::vector<pattern> &patterns, unsigned int mode,
-                       hs_platform_info *plat = nullptr);
+                       const hs_platform_info *plat = nullptr);
 hs_database_t *buildDB(const pattern &pat, unsigned int mode);
 hs_database_t *buildDB(const char *expression, unsigned int flags,
                        unsigned int id, unsigned int mode,
-                       hs_platform_info *plat = nullptr);
+                       const hs_platform_info *plat = nullptr);
 hs_database_t *buildDB(const char *filename, unsigned int mode,
                        unsigned int extra_flags = 0);
 hs_database_t *buildDB(const char *filename, unsigned int mode,

From bb6464431f8a8c5e290fe472d5344da37952a1c1 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Mon, 29 Apr 2024 15:09:55 +0300
Subject: [PATCH 49/56] new variableScope

---
 src/rose/rose_build_misc.cpp | 2 +-
 src/util/dump_charclass.cpp  | 4 ++--
 unit/internal/fdr_flood.cpp  | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index d3ff236d..de0ae706 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -1004,9 +1004,9 @@ bool hasOrphanedTops(const RoseBuildImpl &build) {
 
     for (auto v : vertices_range(g)) {
         if (g[v].left) {
-            set<u32> &tops = leftfixes[g[v].left];
             if (!build.isRootSuccessor(v)) {
                 // Tops for infixes come from the in-edges.
+                set<u32> &tops = leftfixes[g[v].left];
                 for (const auto &e : in_edges_range(v, g)) {
                     tops.insert(g[e].rose_top);
                 }
diff --git a/src/util/dump_charclass.cpp b/src/util/dump_charclass.cpp
index df308dec..2243fcbd 100644
--- a/src/util/dump_charclass.cpp
+++ b/src/util/dump_charclass.cpp
@@ -178,9 +178,9 @@ size_t describeClassInt(ostream &os, const CharReach &incr, size_t maxLength,
 
     // Render charclass as a series of ranges
     size_t c_start = cr.find_first();
-    size_t c = c_start, c_last = 0;
+    size_t c = c_start;
     while (c != CharReach::npos) {
-        c_last = c;
+        size_t c_last = c;
         c = cr.find_next(c);
         if (c != c_last + 1 || c_last == 0xff) {
             describeRange(os, c_start, c_last, out_type);
diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp
index 81afbeaa..77d3ff47 100644
--- a/unit/internal/fdr_flood.cpp
+++ b/unit/internal/fdr_flood.cpp
@@ -488,7 +488,6 @@ TEST_P(FDRFloodp, StreamingMask) {
                                         Grey());
         CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint);
 
-        hwlm_error_t fdrStatus;
         const u32 cnt4 = dataSize - 4 + 1;
 
         for (u32 streamChunk = 1; streamChunk <= 16; streamChunk *= 2) {
@@ -496,7 +495,7 @@ TEST_P(FDRFloodp, StreamingMask) {
             const u8 *d = data.data();
             // reference past the end of fake history to allow headroom
             const u8 *fhist = fake_history.data() + fake_history_size;
-            fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0,
+            hwlm_error_t fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0,
                                          countCallback, &scratch,
                                          HWLM_ALL_GROUPS);
             ASSERT_EQ(0, fdrStatus);

From b5bf3d8d31a456ff37fb0e47ee0985660d93e53d Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 30 Apr 2024 13:36:39 +0300
Subject: [PATCH 50/56] unreadVariable

---
 benchmarks/benchmarks.cpp            | 2 +-
 src/nfa/castle.c                     | 1 -
 src/nfa/castlecompile.cpp            | 3 ++-
 src/rose/counting_miracle.h          | 2 +-
 src/rose/rose_build_add_mask.cpp     | 1 -
 unit/internal/multi_bit_compress.cpp | 6 +++---
 util/cross_compile.cpp               | 2 +-
 7 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index 3c1a0bab..1cf31c5f 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -108,7 +108,7 @@ static void run_benchmarks(int size, int loops, int max_matches,
         u64a total_size = 0;
         auto start = std::chrono::steady_clock::now();
         for (int i = 0; i < loops; i++) {
-            const u8 *res = func(bench);
+            func(bench);
         }
         auto end = std::chrono::steady_clock::now();
         total_sec +=
diff --git a/src/nfa/castle.c b/src/nfa/castle.c
index 355d7951..35b202a2 100644
--- a/src/nfa/castle.c
+++ b/src/nfa/castle.c
@@ -906,7 +906,6 @@ s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
         if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
             return (s64a)loc - hlen;
         }
-        ep = 0;
     }
 
     return sp - 1; /* the repeats are never killed */
diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp
index 56b12700..bef36451 100644
--- a/src/nfa/castlecompile.cpp
+++ b/src/nfa/castlecompile.cpp
@@ -655,7 +655,8 @@ buildCastle(const CastleProto &proto,
     if (!stale_iter.empty()) {
         c->staleIterOffset = verify_u32(ptr - base_ptr);
         copy_bytes(ptr, stale_iter);
-        ptr += byte_length(stale_iter);
+        // Removed unused increment operation
+        // ptr += byte_length(stale_iter);
     }
 
     return nfa;
diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h
index 602907cb..3bd93509 100644
--- a/src/rose/counting_miracle.h
+++ b/src/rose/counting_miracle.h
@@ -192,7 +192,7 @@ int roseCountingMiracleOccurs(const struct RoseEngine *t,
 
     u32 count = 0;
 
-    s64a m_loc = start;
+    s64a m_loc;
 
     if (!cm->shufti) {
         u8 c = cm->c;
diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp
index 7e0e00b0..6c84c40a 100644
--- a/src/rose/rose_build_add_mask.cpp
+++ b/src/rose/rose_build_add_mask.cpp
@@ -131,7 +131,6 @@ void findMaskLiteral(const vector<CharReach> &mask, bool streaming,
     if (better) {
         best_begin = begin;
         best_end = end;
-        best_len = len;
     }
 
     for (size_t i = best_begin; i < best_end; i++) {
diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp
index e0ec475c..dd5b7b7a 100644
--- a/unit/internal/multi_bit_compress.cpp
+++ b/unit/internal/multi_bit_compress.cpp
@@ -401,7 +401,7 @@ TEST_P(MultiBitCompTest, CompCompressDecompressDense) {
 
 TEST(MultiBitComp, CompIntegration1) {
     // 256 + 1 --> smallest 2-level mmbit
-    u32 total_size = mmbit_size(257);
+    //u32 total_size = mmbit_size(257);
     mmbit_holder ba(257);
 
     //-------------------- 1 -----------------------//
@@ -516,7 +516,7 @@ TEST(MultiBitComp, CompIntegration1) {
 
 TEST(MultiBitComp, CompIntegration2) {
     // 64^2 + 1 --> smallest 3-level mmbit
-    u32 total_size = mmbit_size(4097);
+    //u32 total_size = mmbit_size(4097);
     mmbit_holder ba(4097);
 
     //-------------------- 1 -----------------------//
@@ -645,7 +645,7 @@ TEST(MultiBitComp, CompIntegration2) {
 
 TEST(MultiBitComp, CompIntegration3) {
     // 64^3 + 1 --> smallest 4-level mmbit
-    u32 total_size = mmbit_size(262145);
+    //u32 total_size = mmbit_size(262145);
     mmbit_holder ba(262145);
 
     //-------------------- 1 -----------------------//
diff --git a/util/cross_compile.cpp b/util/cross_compile.cpp
index df2aff5a..8ce1e2f0 100644
--- a/util/cross_compile.cpp
+++ b/util/cross_compile.cpp
@@ -55,7 +55,7 @@ unique_ptr<hs_platform_info> xcompileReadMode(const char *s) {
     assert(!err);
 
     string str(s);
-    string mode = str.substr(0, str.find(":"));
+    //string mode = str.substr(0, str.find(":"));
     string opt = str.substr(str.find(":")+1, str.npos);
     bool found_mode = false;
 

From f2d8d637939303c3bdd4205ac3a077c4f90e2a29 Mon Sep 17 00:00:00 2001
From: Yoan Picchi <yoan.picchi@arm.com>
Date: Thu, 4 Apr 2024 10:00:58 +0000
Subject: [PATCH 51/56] Add sheng tests

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 unit/CMakeLists.txt     |   1 +
 unit/internal/sheng.cpp | 709 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 710 insertions(+)
 create mode 100644 unit/internal/sheng.cpp

diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt
index f5577d40..e2196459 100644
--- a/unit/CMakeLists.txt
+++ b/unit/CMakeLists.txt
@@ -102,6 +102,7 @@ set(unit_internal_SOURCES
     internal/rvermicelli.cpp
     internal/simd_utils.cpp
     internal/supervector.cpp
+    internal/sheng.cpp
     internal/shuffle.cpp
     internal/shufti.cpp
     internal/state_compress.cpp
diff --git a/unit/internal/sheng.cpp b/unit/internal/sheng.cpp
new file mode 100644
index 00000000..e8e45ac5
--- /dev/null
+++ b/unit/internal/sheng.cpp
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2024, Arm ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "gtest/gtest.h"
+#include "nfa/shengcompile.h"
+#include "nfa/rdfa.h"
+#include "util/bytecode_ptr.h"
+#include "util/compile_context.h"
+#include "util/report_manager.h"
+
+extern "C" {
+    #include "hs_compile.h"
+    #include "nfa/nfa_api.h"
+    #include "nfa/nfa_api_queue.h"
+    #include "nfa/nfa_api_util.h"
+    #include "nfa/nfa_internal.h"
+    #include "nfa/rdfa.h"
+    #include "nfa/sheng.h"
+    #include "ue2common.h"
+}
+
+namespace {
+
+struct callback_context { /* state handed to periodic_pattern_callback through its void* argument */
+    unsigned int period; /* multiplied by match_count to get the expected match position */
+    unsigned int match_count; /* incremented by periodic_pattern_callback on every call */
+    unsigned int pattern_length; /* subtracted from the reported end offset before the comparison */
+};
+
+int dummy_callback(u64a start, u64a end, ReportID id, void *context) {
+    static_cast<void>(context); /* the user context is deliberately unused */
+    printf("callback %llu %llu %u\n", start, end, id);
+    return 1; /* non-zero keeps the scan going, 0 would stop it */
+}
+
+int periodic_pattern_callback(u64a start, u64a end, ReportID id, void *raw_context) {
+    auto *ctx = static_cast<struct callback_context *>(raw_context);
+    static_cast<void>(start);
+    static_cast<void>(id);
+    EXPECT_EQ(ctx->period * ctx->match_count, end - ctx->pattern_length);
+    ++ctx->match_count; /* one more match seen: the next one is expected one period later */
+    return 1; /* non-zero keeps matching, 0 would stop */
+}
+
+/**
+ * @brief Fill rows [start_state, end_state) so that symbol N moves state N to state N+1.
+ */
+static void fill_straight_regex_sequence(struct ue2::raw_dfa *dfa, int start_state, int end_state, int state_count)
+{
+    for (int s = start_state; s < end_state; ++s) {
+        auto &row = dfa->states[s].next;
+        row.assign(state_count, 1); /* default: every symbol leads to state 1 */
+        row[0] = row[1] = 2;        /* symbols 0 and 1 lead to state 2 */
+        row[s] = s + 1;             /* the diagonal entry advances by one state */
+    }
+}
+
+static void init_raw_dfa16(struct ue2::raw_dfa *dfa, const ReportID rID)
+{ /* Populate dfa with the 8-state automaton for [a,b][c-e]{3}of; rID is reported by state 7. */
+    dfa->start_anchored = 1;
+    dfa->start_floating = 1;
+    dfa->alpha_size = 8;
+
+    int nb_state = 8;
+    for(int i = 0; i < nb_state; i++) {
+        struct ue2::dstate state(dfa->alpha_size);
+        state.next = std::vector<ue2::dstate_id_t>(nb_state);
+        state.daddy = 0;
+        state.impl_id = i; /* id of the state */
+        state.reports = ue2::flat_set<ReportID>();
+        state.reports_eod = ue2::flat_set<ReportID>();
+        dfa->states.push_back(state);
+    }
+
+    /* state 7 is the only accept state in the diagram below: attach the report to it */
+    dfa->states[7].reports.insert(rID);
+
+    /**
+     * [a,b][c-e]{3}of
+     * (1) -a,b-> (2) -c,d,e-> (3) -c,d,e-> (4) -c,d,e-> (5) -o-> (6) -f-> ((7))
+     * (0) = dead
+     */
+
+    for(int i = 0; i < ue2::ALPHABET_SIZE; i++) { /* every byte defaults to symbol 0 ... */
+        dfa->alpha_remap[i] = 0;
+    }
+
+    dfa->alpha_remap['a'] = 0; /* ... which is also the symbol used for 'a' */
+    dfa->alpha_remap['b'] = 1;
+    dfa->alpha_remap['c'] = 2;
+    dfa->alpha_remap['d'] = 3;
+    dfa->alpha_remap['e'] = 4;
+    dfa->alpha_remap['o'] = 5;
+    dfa->alpha_remap['f'] = 6;
+    dfa->alpha_remap[256] = 7; /* for some reason there's a check that runs on dfa->alpha_size-1 */
+
+                        /* a b c d e o f   (column order = symbol index; trailing comment = input seen so far) */
+    dfa->states[0].next = {0,0,0,0,0,0,0};
+    dfa->states[1].next = {2,2,1,1,1,1,1};      /* nothing */
+    dfa->states[2].next = {2,2,3,3,3,1,1};      /* [a,b] */
+    dfa->states[3].next = {2,2,4,4,4,1,1};      /* [a,b][c-e]{1} */
+    dfa->states[4].next = {2,2,5,5,5,1,1};      /* [a,b][c-e]{2} */
+    fill_straight_regex_sequence(dfa, 5, 7, 7); /* states 5 and 6: [a,b][c-e]{3} and [a,b][c-e]{3}o */
+    dfa->states[7].next = {2,2,1,1,1,1,1};      /* [a,b][c-e]{3}of */
+}
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+/* We need more than 16 states to run sheng32, so make the graph longer */
+static void init_raw_dfa32(struct ue2::raw_dfa *dfa, const ReportID rID)
+{ /* Populate dfa with the 18-state automaton for [a,b][c-e]{3}of0123456789; rID is reported by state 17. */
+    dfa->start_anchored = 1;
+    dfa->start_floating = 1;
+    dfa->alpha_size = 18;
+
+    int nb_state = 18;
+    for(int i = 0; i < nb_state; i++) {
+        struct ue2::dstate state(dfa->alpha_size);
+        state.next = std::vector<ue2::dstate_id_t>(nb_state);
+        state.daddy = 0;
+        state.impl_id = i; /* id of the state */
+        state.reports = ue2::flat_set<ReportID>();
+        state.reports_eod = ue2::flat_set<ReportID>();
+        dfa->states.push_back(state);
+    }
+
+    /* state 17 is the only accept state in the diagram below: attach the report to it */
+    dfa->states[17].reports.insert(rID);
+
+    /**
+     * [a,b][c-e]{3}of0123456789
+     * (1) -a,b-> (2) -c,d,e-> (3) -c,d,e-> (4) -c,d,e-> (5) -o-> (6) -f-> (7) -<numbers>-> ((17))
+     * (0) = dead
+     */
+
+    for(int i = 0; i < ue2::ALPHABET_SIZE; i++) { /* every byte defaults to symbol 0 ... */
+        dfa->alpha_remap[i] = 0;
+    }
+
+    dfa->alpha_remap['a'] = 0; /* ... which is also the symbol used for 'a' */
+    dfa->alpha_remap['b'] = 1;
+    dfa->alpha_remap['c'] = 2;
+    dfa->alpha_remap['d'] = 3;
+    dfa->alpha_remap['e'] = 4;
+    dfa->alpha_remap['o'] = 5;
+    dfa->alpha_remap['f'] = 6;
+    // map the digits '0'..'9' to symbols 7..16
+    for (int i = 0; i < 10; i ++) {
+        dfa->alpha_remap[i + '0'] = i + 7;
+    }
+    dfa->alpha_remap[256] = 17; /* for some reason there's a check that runs on dfa->alpha_size-1 */
+
+                         /* a b c d e o f 0 1 2 3 4 5 6 7 8 9   (column order = symbol index) */
+    dfa->states[0].next  = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+    dfa->states[1].next  = {2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};  /* nothing */
+    dfa->states[2].next  = {2,2,3,3,3,1,1,1,1,1,1,1,1,1,1,1,1};  /* [a,b] */
+    dfa->states[3].next  = {2,2,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1};  /* [a,b][c-e]{1} */
+    dfa->states[4].next  = {2,2,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1};  /* [a,b][c-e]{2} */
+    fill_straight_regex_sequence(dfa, 5, 17, 17);                /* states 5 to 16: up to [a,b][c-e]{3}of012345678 */
+    dfa->states[17].next = {2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};  /* [a,b][c-e]{3}of0123456789 */
+}
+#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */
+
+/* Signature shared by ue2::shengCompile and ue2::sheng32Compile. */
+using sheng_compile_ptr = ue2::bytecode_ptr<NFA> (*)(
+    ue2::raw_dfa &, const ue2::CompileContext &, const ue2::ReportManager &,
+    bool, std::set<ue2::dstate_id_t> *);
+
+/* Signature of the init_raw_dfa16 / init_raw_dfa32 DFA builders. */
+using init_raw_dfa_ptr = void (*)(struct ue2::raw_dfa *, const ReportID);
+
+
+/*
+ * Compiles the DFA built by init_dfa_function and hands the released bytecode
+ * to the caller through *out_nfa. Compile-time helpers are stack objects, so
+ * nothing leaks (the raw_dfa used to) even if ASSERT_NE returns early.
+ */
+static inline void init_nfa(struct NFA **out_nfa, sheng_compile_ptr compile_function, init_raw_dfa_ptr init_dfa_function) {
+    ue2::Grey g;
+    hs_platform_info plat_info = {0, 0, 0, 0};
+    ue2::CompileContext cc(false, false, ue2::target_t(plat_info), g);
+    ue2::ReportManager rm(g);
+    ue2::Report report(ue2::EXTERNAL_CALLBACK, 0);
+    ReportID rID = rm.getInternalId(report);
+    rm.setProgramOffset(0, 0);
+
+    ue2::raw_dfa dfa(ue2::NFA_OUTFIX);
+    init_dfa_function(&dfa, rID);
+
+    *out_nfa = (compile_function(dfa, cc, rm, false, nullptr)).release();
+    ASSERT_NE(nullptr, *out_nfa);
+}
+
+static void init_nfa16(struct NFA **out_nfa) { /* init_nfa with ue2::shengCompile and the init_raw_dfa16 DFA */
+    init_nfa(out_nfa, ue2::shengCompile, init_raw_dfa16);
+}
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+static void init_nfa32(struct NFA **out_nfa) { /* init_nfa with ue2::sheng32Compile and the init_raw_dfa32 DFA */
+    init_nfa(out_nfa, ue2::sheng32Compile, init_raw_dfa32);
+}
+#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */
+
+static char state_buffer; /* one byte of state storage shared by every queue built by init_sheng_queue */
+
+/*
+ * Allocates a zeroed mq bound to a freshly compiled NFA (from init_nfa_func),
+ * the shared one-byte state buffer and `buffer`, then queues MQE_START at 0 and
+ * MQE_END at max_size. The caller deletes *out_q.
+ */
+static inline void init_sheng_queue(struct mq **out_q, uint8_t *buffer, size_t max_size, void (*init_nfa_func)(struct NFA **out_nfa) ) {
+    struct NFA *nfa = nullptr;
+    init_nfa_func(&nfa);
+    assert(nfa);
+
+    struct mq *q = new mq();
+    memset(q, 0, sizeof(struct mq));
+
+    q->nfa = nfa;
+    q->state = &state_buffer;
+    q->cb = dummy_callback;
+    q->buffer = buffer;
+    q->length = max_size; /* setting this as the max length scannable */
+    nfaQueueInitState(nfa, q);
+    pushQueueAt(q, 0, MQE_START, 0);
+    pushQueueAt(q, 1, MQE_END, q->length );
+    *out_q = q;
+}
+
+static void init_sheng_queue16(struct mq **out_q, uint8_t *buffer ,size_t max_size) { /* queue backed by the init_nfa16 NFA */
+    init_sheng_queue(out_q, buffer, max_size, init_nfa16);
+}
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+static void init_sheng_queue32(struct mq **out_q, uint8_t *buffer, size_t max_size) { /* queue backed by the init_nfa32 NFA */
+    init_sheng_queue(out_q, buffer, max_size, init_nfa32);
+}
+#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */
+
+/* Fills buf with junk, then stamps `pattern` every `period` bytes from start_offset; sizes smaller than the chunk/pattern no longer wrap around. */
+static
+void fill_pattern(u8* buf, size_t buffer_size, unsigned int start_offset, unsigned int period, const char *pattern, unsigned int pattern_length) {
+    memset(buf, '_', buffer_size);
+    /* junk includes characters used by valid states, to prevent the use of shufti */
+    for (size_t i = 0; i + 8 < buffer_size; i += 8) {
+        memcpy(buf + i, "jgohcxbf", 8);
+    }
+    for (size_t i = start_offset; i + pattern_length < buffer_size; i += period) {
+        memcpy(buf + i, pattern, pattern_length);
+        if (period == 0) break; /* a zero period would otherwise never terminate */
+    }
+}
+
+/* Builds the reference header: zeroed except type/length/nPositions and the two state sizes (1); caller deletes it. */
+struct NFA *get_expected_nfa_header(u8 type, unsigned int length, unsigned int nposition) {
+    struct NFA *hdr = new struct NFA();
+    memset(hdr, 0, sizeof(*hdr));
+    hdr->type = type;
+    hdr->length = length;
+    hdr->nPositions = nposition;
+    hdr->streamStateSize = 1;
+    hdr->scratchStateSize = 1;
+    return hdr;
+}
+
+struct NFA *get_expected_nfa16_header() { /* recorded header for init_raw_dfa16: type SHENG_NFA, length 4736, nPositions 8 */
+    return get_expected_nfa_header(SHENG_NFA, 4736, 8);
+}
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+struct NFA *get_expected_nfa32_header() { /* recorded header for init_raw_dfa32: type SHENG_NFA_32, length 17216, nPositions 18 */
+    return get_expected_nfa_header(SHENG_NFA_32, 17216, 18);
+}
+#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */
+
+void test_nfa_equal(const NFA& l, const NFA& r) /* non-fatal, field-by-field header comparison; callers pass the expected header as l */
+{
+    EXPECT_EQ(l.flags, r.flags);
+    EXPECT_EQ(l.length, r.length);
+    EXPECT_EQ(l.type, r.type);
+    EXPECT_EQ(l.rAccelType, r.rAccelType);
+    EXPECT_EQ(l.rAccelOffset, r.rAccelOffset);
+    EXPECT_EQ(l.maxBiAnchoredWidth, r.maxBiAnchoredWidth);
+    EXPECT_EQ(l.rAccelData.dc, r.rAccelData.dc);
+    EXPECT_EQ(l.queueIndex, r.queueIndex);
+    EXPECT_EQ(l.nPositions, r.nPositions);
+    EXPECT_EQ(l.scratchStateSize, r.scratchStateSize);
+    EXPECT_EQ(l.streamStateSize, r.streamStateSize);
+    EXPECT_EQ(l.maxWidth, r.maxWidth);
+    EXPECT_EQ(l.minWidth, r.minWidth);
+    EXPECT_EQ(l.maxOffset, r.maxOffset);
+}
+
+/* Start of actual tests */
+
+/*
+ * Runs shengCompile and compares its outputs to previously recorded outputs.
+ * Compile-time helpers are stack objects and the bytecode stays owned by its
+ * bytecode_ptr, so nothing leaks even when an assertion aborts the test.
+ */
+TEST(Sheng16, std_compile_header) {
+    ue2::Grey g;
+    hs_platform_info plat_info = {0, 0, 0, 0};
+    ue2::CompileContext cc(false, false, ue2::target_t(plat_info), g);
+    ue2::ReportManager rm(g);
+    ue2::Report report(ue2::EXTERNAL_CALLBACK, 0);
+    ReportID rID = rm.getInternalId(report);
+    rm.setProgramOffset(0, 0);
+
+    ue2::raw_dfa dfa(ue2::NFA_OUTFIX);
+    init_raw_dfa16(&dfa, rID);
+
+    auto compiled = shengCompile(dfa, cc, rm, false);
+    const struct NFA *nfa = compiled.get();
+    /* fatal: every check below dereferences nfa */
+    ASSERT_NE(nullptr, nfa);
+
+    EXPECT_NE(0, nfa->length);
+    EXPECT_EQ(SHENG_NFA, nfa->type);
+
+    /* header fields must match the recorded reference */
+    struct NFA *expected_nfa = get_expected_nfa16_header();
+    test_nfa_equal(*expected_nfa, *nfa);
+
+    delete expected_nfa;
+}
+
+/*
+ * nfaExecSheng_B is the most basic of the sheng variants. It simply calls the core of the algorithm.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng16, std_run_B) {
+    unsigned int pattern_length = 6;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    /* derived from period instead of a second literal: patterns land at offsets 0 and 128 */
+    unsigned int expected_matches = buf_size/period + 1;
+    u8 buf[buf_size];
+    struct callback_context context = {period, 0, pattern_length};
+
+    struct NFA* nfa = nullptr;
+    init_nfa16(&nfa);
+    ASSERT_NE(nullptr, nfa);
+    fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length);
+    char ret_val = 0;
+    unsigned int offset = 0;
+    unsigned int loop_count = 0;
+    for (; loop_count < expected_matches + 1; loop_count++) {
+        ASSERT_LT(offset, buf_size);
+        ret_val = nfaExecSheng_B(nfa,
+                                offset,
+                                buf + offset,
+                                (s64a) buf_size - offset,
+                                periodic_pattern_callback,
+                                &context);
+        offset = (context.match_count - 1) * context.period + context.pattern_length;
+        if(unlikely(ret_val != MO_ALIVE)) {
+            break;
+        }
+    }
+
+    /*check normal return*/
+    EXPECT_EQ(MO_ALIVE, ret_val);
+
+    /*check that we don't find additional match nor crash when no match are found*/
+    EXPECT_EQ(expected_matches + 1, loop_count);
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, context.match_count);
+}
+
+/*
+ * nfaExecSheng_Q runs like the _B version (callback), but exercises the message queue logic.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng16, std_run_Q) {
+    struct mq *q;
+    unsigned int pattern_length = 6;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+    struct callback_context context = {period, 0, pattern_length};
+
+    init_sheng_queue16(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length);
+    q->cur = 0;
+    q->items[q->cur].location = 0;
+    q->context = &context;
+    q->cb = periodic_pattern_callback;
+
+    nfaExecSheng_Q(q->nfa, q, (s64a) buf_size);
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, context.match_count);
+
+    delete q;
+}
+
+/*
+ * nfaExecSheng_Q2 uses the message queue, but stops at match instead of using a callback.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng16, std_run_Q2) {
+    struct mq *q;
+    unsigned int pattern_length = 6;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+
+    init_sheng_queue16(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length);
+    q->cur = 0;
+    q->items[q->cur].location = 0;
+
+    char ret_val;
+    int location;
+    unsigned int loop_count = 0;
+    do {
+        ret_val = nfaExecSheng_Q2(q->nfa, q, (s64a) buf_size);
+        location = q->items[q->cur].location;
+        loop_count++;
+    } while(likely((ret_val == MO_MATCHES_PENDING) && (location < (int)buf_size) && ((location % period) == pattern_length)));
+
+    /*check if it's a spurious match*/
+    EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && ((location % period) != pattern_length));
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, loop_count-1);
+
+    delete q;
+}
+
+/*
+ * The message queue can also run on the "history" buffer. We test it the same way as the normal
+ * buffer, expecting the same behavior.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng16, history_run_Q2) {
+    struct mq *q;
+    unsigned int pattern_length = 6;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+
+    init_sheng_queue16(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length);
+    q->history = buf;
+    q->hlength = buf_size;
+    q->cur = 0;
+    q->items[q->cur].location = -(s64a)buf_size; /* was a literal -200; now tied to buf_size (== hlength) */
+
+    char ret_val;
+    int location;
+    unsigned int loop_count = 0;
+    do {
+        ret_val = nfaExecSheng_Q2(q->nfa, q, 0);
+        location = q->items[q->cur].location;
+        loop_count++;
+    } while(likely((ret_val == MO_MATCHES_PENDING) && (location > -(int)buf_size) && (location < 0) && (((buf_size + location) % period) == pattern_length)));
+
+    /*check if it's a spurious match*/
+    EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && (((buf_size + location) % period) != pattern_length));
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, loop_count-1);
+
+    delete q;
+}
+
+/**
+ * These tests only cover the basic paths. More tests could cover:
+ * - running from the history buffer into the current buffer in Q2
+ * - running while expecting no match
+ * - nfaExecSheng_QR
+ * - running sheng when it should call an accelerator, and confirming that it calls it
+ */
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
+
+/*
+ * Runs sheng32Compile and compares its outputs to previously recorded outputs.
+ * Helpers live on the stack and the bytecode_ptr keeps the NFA, so nothing leaks.
+ */
+TEST(Sheng32, std_compile_header) {
+#if defined(HAVE_SVE)
+    if(svcntb()<32) {
+        return;
+    }
+#endif
+    ue2::Grey g;
+    hs_platform_info plat_info = {0, 0, 0, 0};
+    ue2::CompileContext cc(false, false, ue2::target_t(plat_info), g);
+    ue2::ReportManager rm(g);
+    ue2::Report report(ue2::EXTERNAL_CALLBACK, 0);
+    ReportID rID = rm.getInternalId(report);
+    rm.setProgramOffset(0, 0);
+
+    ue2::raw_dfa dfa(ue2::NFA_OUTFIX);
+    init_raw_dfa32(&dfa, rID);
+
+    auto compiled = sheng32Compile(dfa, cc, rm, false);
+    const struct NFA *nfa = compiled.get();
+    /* fatal: every check below dereferences nfa */
+    ASSERT_NE(nullptr, nfa);
+
+    EXPECT_NE(0, nfa->length);
+    EXPECT_EQ(SHENG_NFA_32, nfa->type);
+
+    /* header fields must match the recorded reference */
+    struct NFA *expected_nfa = get_expected_nfa32_header();
+    test_nfa_equal(*expected_nfa, *nfa);
+
+    delete expected_nfa;
+}
+
+/*
+ * nfaExecSheng32_B is the most basic of the sheng variants. It simply calls the core of the algorithm.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng32, std_run_B) {
+#if defined(HAVE_SVE)
+    if(svcntb()<32) {
+        return;
+    }
+#endif
+    unsigned int pattern_length = 16;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    /* derived from period instead of a second literal: patterns land at offsets 0 and 128 */
+    unsigned int expected_matches = buf_size/period + 1;
+    u8 buf[buf_size];
+    struct callback_context context = {period, 0, pattern_length};
+
+    struct NFA* nfa = nullptr;
+    init_nfa32(&nfa);
+    ASSERT_NE(nullptr, nfa);
+    fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length);
+    char ret_val = 0;
+    unsigned int offset = 0;
+    unsigned int loop_count = 0;
+    for (; loop_count < expected_matches + 1; loop_count++) {
+        ASSERT_LT(offset, buf_size);
+        ret_val = nfaExecSheng32_B(nfa,
+                                offset,
+                                buf + offset,
+                                (s64a) buf_size - offset,
+                                periodic_pattern_callback,
+                                &context);
+        offset = (context.match_count - 1) * context.period + context.pattern_length;
+        if(unlikely(ret_val != MO_ALIVE)) {
+            break;
+        }
+    }
+
+    /*check normal return*/
+    EXPECT_EQ(MO_ALIVE, ret_val);
+
+    /*check that we don't find additional match nor crash when no match are found*/
+    EXPECT_EQ(expected_matches + 1, loop_count);
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, context.match_count);
+}
+
+/*
+ * nfaExecSheng32_Q runs like the _B version (callback), but exercises the message queue logic.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng32, std_run_Q) {
+#if defined(HAVE_SVE)
+    if(svcntb()<32) {
+        return;
+    }
+#endif
+    struct mq *q;
+    unsigned int pattern_length = 16;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+    struct callback_context context = {period, 0, pattern_length};
+
+    init_sheng_queue32(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length);
+    q->cur = 0;
+    q->items[q->cur].location = 0;
+    q->context = &context;
+    q->cb = periodic_pattern_callback;
+
+    nfaExecSheng32_Q(q->nfa, q, (s64a) buf_size);
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, context.match_count);
+
+    delete q;
+}
+
+/*
+ * nfaExecSheng32_Q2 uses the message queue, but stops at match instead of using a callback.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng32, std_run_Q2) {
+#if defined(HAVE_SVE)
+    if(svcntb()<32) {
+        return;
+    }
+#endif
+    struct mq *q;
+    unsigned int pattern_length = 16;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+
+    init_sheng_queue32(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length);
+    q->cur = 0;
+    q->items[q->cur].location = 0;
+
+    char ret_val;
+    int location;
+    unsigned int loop_count = 0;
+    do {
+        ret_val = nfaExecSheng32_Q2(q->nfa, q, (s64a) buf_size);
+        location = q->items[q->cur].location;
+        loop_count++;
+    } while(likely((ret_val == MO_MATCHES_PENDING) && (location < (int)buf_size) && ((location % period) == pattern_length)));
+
+    /*check if it's a spurious match*/
+    EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && ((location % period) != pattern_length));
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, loop_count-1);
+
+    delete q;
+}
+
+/*
+ * The message queue can also run on the "history" buffer. We test it the same way as the normal
+ * buffer, expecting the same behavior.
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all.
+ */
+TEST(Sheng32, history_run_Q2) {
+#if defined(HAVE_SVE)
+    if(svcntb()<32) {
+        return;
+    }
+#endif
+    struct mq *q;
+    unsigned int pattern_length = 16;
+    unsigned int period = 128;
+    const size_t buf_size = 200;
+    unsigned int expected_matches = buf_size/period + 1; /* patterns land at offsets 0 and 128 */
+    u8 buf[buf_size];
+
+    init_sheng_queue32(&q, buf, buf_size);
+    fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length);
+    q->history = buf;
+    q->hlength = buf_size;
+    q->cur = 0;
+    q->items[q->cur].location = -(s64a)buf_size; /* was a literal -200; now tied to buf_size (== hlength) */
+
+    char ret_val;
+    int location;
+    unsigned int loop_count = 0;
+    do {
+        ret_val = nfaExecSheng32_Q2(q->nfa, q, 0);
+        location = q->items[q->cur].location;
+        loop_count++;
+    } while(likely((ret_val == MO_MATCHES_PENDING) && (location > -(int)buf_size) && (location < 0) && (((buf_size + location) % period) == pattern_length)));
+
+    /*check if it's a spurious match*/
+    EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && (((buf_size + location) % period) != pattern_length));
+
+    /*check that we have all the matches*/
+    EXPECT_EQ(expected_matches, loop_count-1);
+
+    delete q;
+}
+#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */
+
+} /* namespace */
\ No newline at end of file

From 9902ca0e34f1c480e9fbafbac6f1fcb453e7788b Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 1 May 2024 10:54:15 +0300
Subject: [PATCH 52/56] addressing 47 [constParameterReference],48
 [constVariableReference],58 [constVariable]

---
 benchmarks/benchmarks.cpp              |  2 +-
 examples/patbench.cc                   |  4 +-
 examples/pcapscan.cc                   |  2 +-
 src/fdr/fdr_confirm_compile.cpp        |  2 +-
 src/fdr/teddy_compile.cpp              |  2 +-
 src/nfa/goughcompile.cpp               |  2 +-
 src/nfa/goughcompile_reg.cpp           |  2 +-
 src/nfa/mcclellancompile.cpp           | 12 ++---
 src/nfa/mcsheng_compile.cpp            |  4 +-
 src/nfa/shengcompile.cpp               |  8 +--
 src/nfa/truffle_simd.hpp               |  4 +-
 src/nfagraph/ng.cpp                    |  2 +-
 src/nfagraph/ng_anchored_dots.cpp      |  4 +-
 src/nfagraph/ng_equivalence.cpp        |  6 +--
 src/nfagraph/ng_extparam.cpp           |  2 +-
 src/nfagraph/ng_fuzzy.cpp              |  2 +-
 src/nfagraph/ng_literal_component.cpp  |  2 +-
 src/nfagraph/ng_puff.cpp               |  4 +-
 src/nfagraph/ng_redundancy.cpp         |  8 +--
 src/nfagraph/ng_repeat.cpp             |  4 +-
 src/nfagraph/ng_som.cpp                | 10 ++--
 src/nfagraph/ng_som_add_redundancy.cpp |  2 +-
 src/nfagraph/ng_uncalc_components.cpp  |  4 +-
 src/nfagraph/ng_violet.cpp             | 18 +++----
 src/parser/ComponentRepeat.cpp         |  2 +-
 src/parser/buildstate.cpp              |  2 +-
 src/rose/rose_build_add.cpp            |  4 +-
 src/rose/rose_build_bytecode.cpp       | 18 +++----
 src/rose/rose_build_exclusive.cpp      |  4 +-
 src/rose/rose_build_groups.cpp         |  2 +-
 src/rose/rose_build_merge.cpp          |  8 +--
 src/rose/rose_build_role_aliasing.cpp  |  8 +--
 src/som/slot_manager.cpp               |  2 +-
 src/util/clique.cpp                    |  2 +-
 tools/hsbench/engine_hyperscan.cpp     | 10 ++--
 unit/hyperscan/logical_combination.cpp | 72 +++++++++++++-------------
 unit/hyperscan/multi.cpp               | 34 ++++++------
 unit/internal/nfagraph_common.h        |  2 +-
 unit/internal/state_compress.cpp       |  8 +--
 util/ng_find_matches.cpp               |  2 +-
 40 files changed, 146 insertions(+), 146 deletions(-)

diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
index 91cab3f8..489dfeb5 100644
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@@ -129,7 +129,7 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
 }
 
 int main(){
-    int matches[] = {0, MAX_MATCHES};
+    const int matches[] = {0, MAX_MATCHES};
     std::vector<size_t> sizes;
     for (size_t i = 0; i < N; i++) sizes.push_back(16000 << i*2);
     const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa"; 
diff --git a/examples/patbench.cc b/examples/patbench.cc
index 1f965f13..dece8d78 100644
--- a/examples/patbench.cc
+++ b/examples/patbench.cc
@@ -389,7 +389,7 @@ public:
     // Close all open Hyperscan streams (potentially generating any
     // end-anchored matches)
     void closeStreams() {
-        for (auto &stream : streams) {
+        for (const auto &stream : streams) {
             hs_error_t err =
                 hs_close_stream(stream, scratch, onMatch, &matchCount);
             if (err != HS_SUCCESS) {
@@ -565,7 +565,7 @@ double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
 }
 
 static
-double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
+double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
                 unsigned repeatCount, Criterion criterion,
                 bool diagnose = true) {
     double compileTime = 0;
diff --git a/examples/pcapscan.cc b/examples/pcapscan.cc
index bd5493a4..1ec98d78 100644
--- a/examples/pcapscan.cc
+++ b/examples/pcapscan.cc
@@ -281,7 +281,7 @@ public:
     // Close all open Hyperscan streams (potentially generating any
     // end-anchored matches)
     void closeStreams() {
-        for (auto &stream : streams) {
+        for (const auto &stream : streams) {
             hs_error_t err = hs_close_stream(stream, scratch, onMatch,
                                              &matchCount);
             if (err != HS_SUCCESS) {
diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp
index 75b237b0..7f4ad7ec 100644
--- a/src/fdr/fdr_confirm_compile.cpp
+++ b/src/fdr/fdr_confirm_compile.cpp
@@ -159,7 +159,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
     map<u32, vector<LiteralIndex> > res2lits;
     hwlm_group_t gm = 0;
     for (LiteralIndex i = 0; i < lits.size(); i++) {
-        LitInfo & li = tmpLitInfo[i];
+        const LitInfo & li = tmpLitInfo[i];
         u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
         DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
         res2lits[hash].emplace_back(i);
diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp
index e7398b6f..23b70bb7 100644
--- a/src/fdr/teddy_compile.cpp
+++ b/src/fdr/teddy_compile.cpp
@@ -622,7 +622,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 static
 bool assignStringsToBuckets(
                 const vector<hwlmLiteral> &lits,
-                TeddyEngineDescription &eng,
+                const TeddyEngineDescription &eng,
                 map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
     assert(eng.numMasks <= MAX_NUM_MASKS);
     if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp
index 59ef052f..62cdf72c 100644
--- a/src/nfa/goughcompile.cpp
+++ b/src/nfa/goughcompile.cpp
@@ -1017,7 +1017,7 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
         verts[gbs.gg[v].state_id] = v;
     }
 
-    for (auto &m : gbs.built_accel) {
+    for (const auto &m : gbs.built_accel) {
         gough_accel *ga = m.first;
         assert(!ga->prog_offset);
         GoughVertex v = verts[m.second];
diff --git a/src/nfa/goughcompile_reg.cpp b/src/nfa/goughcompile_reg.cpp
index d088e1c0..4798a9c3 100644
--- a/src/nfa/goughcompile_reg.cpp
+++ b/src/nfa/goughcompile_reg.cpp
@@ -438,7 +438,7 @@ void create_slot_mapping(const GoughGraph &cfg, UNUSED u32 old_slot_count,
 }
 
 static
-void update_local_slots(GoughGraph &g, set<GoughSSAVar *> &locals,
+void update_local_slots(GoughGraph &g, const set<GoughSSAVar *> &locals,
                         u32 local_base) {
     DEBUG_PRINTF("%zu local variables\n", locals.size());
     /* local variables only occur on edges (joins are never local) */
diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp
index 36545e98..d0e30573 100644
--- a/src/nfa/mcclellancompile.cpp
+++ b/src/nfa/mcclellancompile.cpp
@@ -361,7 +361,7 @@ struct raw_report_list {
     raw_report_list(const flat_set<ReportID> &reports_in,
                     const ReportManager &rm, bool do_remap) {
         if (do_remap) {
-            for (auto &id : reports_in) {
+            for (const auto &id : reports_in) {
                 reports.insert(rm.getProgramOffset(id));
             }
         } else {
@@ -540,7 +540,7 @@ size_t calcWideRegionSize(const dfa_info &info) {
 static
 void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
                const vector<u32> &reports, const vector<u32> &reports_eod,
-               vector<u32> &reportOffsets) {
+               const vector<u32> &reportOffsets) {
     const dstate &raw_state = info.states[i];
     aux->accept = raw_state.reports.empty() ? 0 : reportOffsets[reports[i]];
     aux->accept_eod = raw_state.reports_eod.empty() ? 0
@@ -794,8 +794,8 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
         }
 
         for (size_t i : order) {
-            vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
-            vector<symbol_t> &symbol_chain = info.wide_symbol_chain[i];
+            const vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
+            const vector<symbol_t> &symbol_chain = info.wide_symbol_chain[i];
 
             u16 width = verify_u16(symbol_chain.size());
             *(u16 *)(curr_wide_entry + WIDE_WIDTH_OFFSET) = width;
@@ -1367,11 +1367,11 @@ bool store_chain_longest(vector<vector<dstate_id_t>> &candidate_chain,
 /* \brief Generate wide_symbol_chain from wide_state_chain. */
 static
 void generate_symbol_chain(dfa_info &info, vector<symbol_t> &chain_tail) {
-    raw_dfa &rdfa = info.raw;
+    const raw_dfa &rdfa = info.raw;
     assert(chain_tail.size() == info.wide_state_chain.size());
 
     for (size_t i = 0; i < info.wide_state_chain.size(); i++) {
-        vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
+        const vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
         vector<symbol_t> symbol_chain;
 
         info.extra[state_chain[0]].wideHead = true;
diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp
index 622362be..81db7024 100644
--- a/src/nfa/mcsheng_compile.cpp
+++ b/src/nfa/mcsheng_compile.cpp
@@ -955,7 +955,7 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
 }
 
 static
-void fill_in_sherman(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
+void fill_in_sherman(NFA *nfa, const dfa_info &info, UNUSED u16 sherman_limit) {
     char *nfa_base = (char *)nfa;
     mcsheng *m = (mcsheng *)getMutableImplNfa(nfa);
     char *sherman_table = nfa_base + m->sherman_offset;
@@ -1109,7 +1109,7 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info,
 }
 
 static
-void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
+void fill_in_sherman64(NFA *nfa, const dfa_info &info, UNUSED u16 sherman_limit) {
     char *nfa_base = (char *)nfa;
     mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
     char *sherman_table = nfa_base + m->sherman_offset;
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index 055e1971..caf7df3f 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -99,7 +99,7 @@ struct dfa_info {
         return next(idx, TOP);
     }
     dstate &next(dstate_id_t idx, u16 chr) {
-        auto &src = (*this)[idx];
+        const auto &src = (*this)[idx];
         auto next_id = src.next[raw.alpha_remap[chr]];
         return states[next_id];
     }
@@ -109,7 +109,7 @@ struct dfa_info {
         // if DFA can't die, shift all indices left by 1
         return can_die ? idx : idx + 1;
     }
-    bool isDead(dstate &state) {
+    bool isDead(const dstate &state) {
         return raw_id(state.impl_id) == DEAD_STATE;
     }
     bool isDead(dstate_id_t idx) {
@@ -117,7 +117,7 @@ struct dfa_info {
     }
 
 private:
-    static bool dfaCanDie(raw_dfa &rdfa) {
+    static bool dfaCanDie(const raw_dfa &rdfa) {
         for (unsigned chr = 0; chr < 256; chr++) {
             for (dstate_id_t state = 0; state < rdfa.states.size(); state++) {
                 auto succ = rdfa.states[state].next[rdfa.alpha_remap[chr]];
@@ -138,7 +138,7 @@ struct raw_report_list {
     raw_report_list(const flat_set<ReportID> &reports_in,
                     const ReportManager &rm, bool do_remap) {
         if (do_remap) {
-            for (auto &id : reports_in) {
+            for (const auto &id : reports_in) {
                 reports.insert(rm.getProgramOffset(id));
             }
         } else {
diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp
index f7dbc6bb..e63180d0 100644
--- a/src/nfa/truffle_simd.hpp
+++ b/src/nfa/truffle_simd.hpp
@@ -227,7 +227,7 @@ const u8 *fwdBlock(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_ma
 }
 
 template <uint16_t S>
-const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
+const u8 *truffleExecReal(const m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
     assert(buf && buf_end);
     assert(buf < buf_end);
     DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
@@ -349,4 +349,4 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
 
     return buf - 1;
 }
-#endif //HAVE_SVE
\ No newline at end of file
+#endif //HAVE_SVE
diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp
index b2a87523..9f8faba5 100644
--- a/src/nfagraph/ng.cpp
+++ b/src/nfagraph/ng.cpp
@@ -292,7 +292,7 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
 
 // Returns true if all components have been added.
 static
-bool processComponents(NG &ng, ExpressionInfo &expr,
+bool processComponents(NG &ng, const ExpressionInfo &expr,
                        deque<unique_ptr<NGHolder>> &g_comp,
                        const som_type som) {
     const u32 num_components = g_comp.size();
diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp
index 9a13376d..fd39fb01 100644
--- a/src/nfagraph/ng_anchored_dots.cpp
+++ b/src/nfagraph/ng_anchored_dots.cpp
@@ -257,7 +257,7 @@ void reformAnchoredRepeatsComponent(NGHolder &g,
 
 static
 void reformUnanchoredRepeatsComponent(NGHolder &g,
-                                      set<NFAVertex> &compAnchoredStarts,
+                                      const set<NFAVertex> &compAnchoredStarts,
                                       set<NFAVertex> &compUnanchoredStarts,
                                       set<NFAVertex> &dead,
                                       depth *startBegin, depth *startEnd) {
@@ -555,7 +555,7 @@ void collapseVariableRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
 }
 
 static
-void addDotsBetween(NGHolder &g, NFAVertex lhs, vector<NFAVertex> &rhs,
+void addDotsBetween(NGHolder &g, NFAVertex lhs, const vector<NFAVertex> &rhs,
                     depth min_repeat, depth max_repeat) {
     const bool unbounded = max_repeat.is_infinite();
     if (unbounded) {
diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp
index 7bfe3c93..c45096c5 100644
--- a/src/nfagraph/ng_equivalence.cpp
+++ b/src/nfagraph/ng_equivalence.cpp
@@ -159,7 +159,7 @@ public:
         return id;
     }
 
-    void append(WorkQueue &other) {
+    void append(const WorkQueue &other) {
         for (const auto &e : other) {
             push(e);
         }
@@ -193,7 +193,7 @@ private:
 }
 
 static
-bool outIsIrreducible(NFAVertex &v, const NGHolder &g) {
+bool outIsIrreducible(const NFAVertex &v, const NGHolder &g) {
     unsigned nonSpecialVertices = 0;
     for (auto w : adjacent_vertices_range(v, g)) {
         if (!is_special(w, g) && w != v) {
@@ -204,7 +204,7 @@ bool outIsIrreducible(NFAVertex &v, const NGHolder &g) {
 }
 
 static
-bool inIsIrreducible(NFAVertex &v, const NGHolder &g) {
+bool inIsIrreducible(const NFAVertex &v, const NGHolder &g) {
     unsigned nonSpecialVertices = 0;
     for (auto u : inv_adjacent_vertices_range(v, g)) {
         if (!is_special(u, g) && u != v) {
diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp
index 65e30a14..32c1c002 100644
--- a/src/nfagraph/ng_extparam.cpp
+++ b/src/nfagraph/ng_extparam.cpp
@@ -430,7 +430,7 @@ NFAVertex findSingleCyclic(const NGHolder &g) {
 }
 
 static
-bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g,
+bool hasOffsetAdjust(const ReportManager &rm, const NGHolder &g,
                      int *adjust) {
     const auto &reports = all_reports(g);
     if (reports.empty()) {
diff --git a/src/nfagraph/ng_fuzzy.cpp b/src/nfagraph/ng_fuzzy.cpp
index 78fd8629..20c76288 100644
--- a/src/nfagraph/ng_fuzzy.cpp
+++ b/src/nfagraph/ng_fuzzy.cpp
@@ -582,7 +582,7 @@ private:
 
         // set up all reports
         bool clone = false;
-        for (auto &pair : reports_to_vertices) {
+        for (const auto &pair : reports_to_vertices) {
             const auto &reports = pair.first;
             const auto &vertices = pair.second;
 
diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp
index 4d3965df..dfda0838 100644
--- a/src/nfagraph/ng_literal_component.cpp
+++ b/src/nfagraph/ng_literal_component.cpp
@@ -98,7 +98,7 @@ void addToString(string &s, const NGHolder &g, NFAVertex v) {
 }
 
 static
-bool splitOffLiteral(NG &ng, NGHolder &g, NFAVertex v, const bool anchored,
+bool splitOffLiteral(NG &ng, const NGHolder &g, NFAVertex v, const bool anchored,
                      set<NFAVertex> &dead) {
     DEBUG_PRINTF("examine vertex %zu\n", g[v].index);
     bool nocase = false, casefixed = false;
diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp
index 9b03f4c0..8f130eaa 100644
--- a/src/nfagraph/ng_puff.cpp
+++ b/src/nfagraph/ng_puff.cpp
@@ -241,7 +241,7 @@ u32 allowedSquashDistance(const CharReach &cr, u32 min_width, const NGHolder &g,
 /** Gives a stronger puff trigger when the trigger is connected to a wide
  * cyclic state (aside from sds) */
 static
-void improveHead(NGHolder &g, NFAVertex *a, vector<NFAVertex> *nodes) {
+void improveHead(const NGHolder &g, NFAVertex *a, vector<NFAVertex> *nodes) {
     DEBUG_PRINTF("attempting to improve puff trigger\n");
     assert(!nodes->empty());
     const CharReach &puff_cr = g[nodes->back()].char_reach;
@@ -260,7 +260,7 @@ void improveHead(NGHolder &g, NFAVertex *a, vector<NFAVertex> *nodes) {
 }
 
 static
-void constructPuff(NGHolder &g, const NFAVertex a, const NFAVertex puffv,
+void constructPuff(const NGHolder &g, const NFAVertex a, const NFAVertex puffv,
                    const CharReach &cr, const ReportID report, u32 width,
                    bool fixed_depth, bool unbounded, bool auto_restart,
                    RoseBuild &rose, ReportManager &rm,
diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp
index a499a40d..5d31d558 100644
--- a/src/nfagraph/ng_redundancy.cpp
+++ b/src/nfagraph/ng_redundancy.cpp
@@ -323,7 +323,7 @@ bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap,
 
     bool changed = false;
     for (auto v : vertices_range(g)) {
-        VertexInfo &info = infoMap[v];
+        const VertexInfo &info = infoMap[v];
 
         if (info.isRemoved) {
             continue;
@@ -439,7 +439,7 @@ bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap,
                 continue; // Conservatively skip anything with nonzero tops.
             }
 
-            CharReach &otherReach = g[t].char_reach;
+            const CharReach &otherReach = g[t].char_reach;
             if (currReach.isSubsetOf(otherReach)) {
                 DEBUG_PRINTF("removing redundant vertex %zu (keeping %zu)\n",
                              g[v].index, g[t].index);
@@ -745,7 +745,7 @@ u32 findCyclic(const NGHolder &g, vector<bool> &cyclic) {
 }
 
 static
-void findCyclicDom(NGHolder &g, vector<bool> &cyclic,
+void findCyclicDom(const NGHolder &g, vector<bool> &cyclic,
                    set<NFAEdge> &dead, som_type som) {
     auto dominators = findDominators(g);
 
@@ -789,7 +789,7 @@ void findCyclicDom(NGHolder &g, vector<bool> &cyclic,
 }
 
 static
-void findCyclicPostDom(NGHolder &g, vector<bool> &cyclic,
+void findCyclicPostDom(const NGHolder &g, vector<bool> &cyclic,
                        set<NFAEdge> &dead) {
     auto postdominators = findPostDominators(g);
 
diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp
index 2aa31808..71c95fce 100644
--- a/src/nfagraph/ng_repeat.cpp
+++ b/src/nfagraph/ng_repeat.cpp
@@ -1874,7 +1874,7 @@ void buildFeeder(NGHolder &g, const BoundedRepeatData &rd,
  * offset.
  */
 static
-bool improveLeadingRepeat(NGHolder &g, BoundedRepeatData &rd,
+bool improveLeadingRepeat(NGHolder &g, const BoundedRepeatData &rd,
                           unordered_set<NFAVertex> &created,
                           const vector<BoundedRepeatData> &all_repeats) {
     assert(edge(g.startDs, g.startDs, g).second);
@@ -1944,7 +1944,7 @@ bool improveLeadingRepeat(NGHolder &g, BoundedRepeatData &rd,
 }
 
 static
-vector<NFAVertex> makeOwnStraw(NGHolder &g, BoundedRepeatData &rd,
+vector<NFAVertex> makeOwnStraw(NGHolder &g, const BoundedRepeatData &rd,
                                const vector<NFAVertex> &straw) {
     // Straw runs from startDs to our pos trigger.
     assert(!straw.empty());
diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp
index 359fa17b..9d854701 100644
--- a/src/nfagraph/ng_som.cpp
+++ b/src/nfagraph/ng_som.cpp
@@ -1177,7 +1177,7 @@ void expandGraph(NGHolder &g, unordered_map<NFAVertex, u32> &regions,
 }
 
 static
-bool doTreePlanningIntl(NGHolder &g,
+bool doTreePlanningIntl(const NGHolder &g,
             const unordered_map<NFAVertex, u32> &regions,
             const map<u32, region_info> &info,
             map<u32, region_info>::const_iterator picked, u32 bad_region,
@@ -1855,7 +1855,7 @@ bool doSomRevNfa(NG &ng, NGHolder &g, const CompileContext &cc) {
 }
 
 static
-u32 doSomRevNfaPrefix(NG &ng, const ExpressionInfo &expr, NGHolder &g,
+u32 doSomRevNfaPrefix(NG &ng, const ExpressionInfo &expr, const NGHolder &g,
                       const CompileContext &cc) {
     depth maxWidth = findMaxWidth(g);
 
@@ -2011,7 +2011,7 @@ void setReportOnHaigPrefix(RoseBuild &rose, NGHolder &h) {
 }
 
 static
-bool tryHaig(RoseBuild &rose, NGHolder &g,
+bool tryHaig(RoseBuild &rose, const NGHolder &g,
              const unordered_map<NFAVertex, u32> &regions,
              som_type som, u32 somPrecision,
              map<u32, region_info>::const_iterator picked,
@@ -2442,7 +2442,7 @@ void makeReportsSomPass(ReportManager &rm, NGHolder &g) {
 }
 
 static
-bool doLitHaigSom(NG &ng, NGHolder &g, som_type som) {
+bool doLitHaigSom(NG &ng, const NGHolder &g, som_type som) {
     ue2_literal lit;
     shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
     if (!rhs) {
@@ -2659,7 +2659,7 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g,
 }
 
 static
-bool doMultiLitHaigSom(NG &ng, NGHolder &g, som_type som) {
+bool doMultiLitHaigSom(NG &ng, const NGHolder &g, som_type som) {
     set<ue2_literal> lits;
     shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
     if (!ng.cc.grey.allowLitHaig) {
diff --git a/src/nfagraph/ng_som_add_redundancy.cpp b/src/nfagraph/ng_som_add_redundancy.cpp
index 871679d9..f8f953be 100644
--- a/src/nfagraph/ng_som_add_redundancy.cpp
+++ b/src/nfagraph/ng_som_add_redundancy.cpp
@@ -112,7 +112,7 @@ bool forkVertex(NFAVertex v, NGHolder &g, vector<DepthMinMax> &depths,
     }
     *numNewVertices += predGroups.size();
 
-    for (auto &group : predGroups) {
+    for (const auto &group : predGroups) {
         const depth &predDepth = group.first;
         const vector<NFAEdge> &preds = group.second;
 
diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp
index a10673e6..fc4ffe1a 100644
--- a/src/nfagraph/ng_uncalc_components.cpp
+++ b/src/nfagraph/ng_uncalc_components.cpp
@@ -454,8 +454,8 @@ void buildNfaMergeQueue(const vector<NGHolder *> &cluster,
                 }
             }
 
-            NGHolder &g_i = *(cluster[ci]);
-            NGHolder &g_j = *(cluster[cj]);
+            const NGHolder &g_i = *(cluster[ci]);
+            const NGHolder &g_j = *(cluster[cj]);
 
             if (!compatibleStarts(g_i, g_j)) {
                 continue;
diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp
index 02461e98..a9e099ee 100644
--- a/src/nfagraph/ng_violet.cpp
+++ b/src/nfagraph/ng_violet.cpp
@@ -688,7 +688,7 @@ unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
     }
 
     if (seeking_transient) {
-        for (auto &a : lits) {
+        for (const auto &a : lits) {
             a->creates_transient
                 = createsTransientLHS(g, a->vv, *depths, cc.grey);
         }
@@ -697,7 +697,7 @@ unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
     if (last_chance) {
         const size_t num_verts = num_vertices(g);
         auto color_map = make_small_color_map(g);
-        for (auto &a : lits) {
+        for (const auto &a : lits) {
             size_t num_reachable = count_reachable(g, a->vv, color_map);
             double ratio = (double)num_reachable / (double)num_verts;
             a->split_ratio = ratio > 0.5 ? 1 - ratio : ratio;
@@ -1172,7 +1172,7 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
 #define MAX_LEN_2_LITERALS_PER_CUT 3
 
 static
-bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores,
+bool checkValidNetflowLits(const NGHolder &h, const vector<u64a> &scores,
                            const map<NFAEdge, set<ue2_literal>> &cut_lits,
                            u32 min_allowed_length) {
     DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(),
@@ -1209,7 +1209,7 @@ bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores,
 }
 
 static
-void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
+void splitEdgesByCut(const NGHolder &h, RoseInGraph &vg,
                      const vector<RoseInEdge> &to_cut,
                      const vector<NFAEdge> &cut,
                      const map<NFAEdge, set<ue2_literal>> &cut_lits) {
@@ -1805,7 +1805,7 @@ void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
 }
 
 static
-RoseInVertex getStart(RoseInGraph &vg) {
+RoseInVertex getStart(const RoseInGraph &vg) {
     for (RoseInVertex v : vertices_range(vg)) {
         if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
             return v;
@@ -1870,7 +1870,7 @@ unique_ptr<NGHolder> make_chain(u32 count) {
 #define SHORT_TRIGGER_LEN 16
 
 static
-bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
+bool makeTransientFromLongLiteral(const NGHolder &h, RoseInGraph &vg,
                                   const vector<RoseInEdge> &ee,
                                   const CompileContext &cc) {
     /* check max width and literal lengths to see if possible */
@@ -2150,7 +2150,7 @@ void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) {
 #define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10
 
 static
-bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg,
+bool extractStrongLiteral(const NGHolder &h, RoseInGraph &vg,
                           const vector<RoseInEdge> &ee,
                           const CompileContext &cc) {
     DEBUG_PRINTF("looking for string literal\n");
@@ -2805,7 +2805,7 @@ bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) {
 }
 
 static
-vector<vector<CharReach>> getDfaTriggers(RoseInGraph &vg,
+vector<vector<CharReach>> getDfaTriggers(const RoseInGraph &vg,
                                          const vector<RoseInEdge> &edges,
                                          bool *single_trigger) {
     vector<vector<CharReach>> triggers;
@@ -2927,7 +2927,7 @@ bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes,
                               vector<RoseInEdge>> edges_by_graph;
         for (const RoseInEdge &ve : edges_range(vg)) {
             if (vg[ve].graph && !vg[ve].dfa) {
-                auto &h = vg[ve].graph;
+                const auto &h = vg[ve].graph;
                 edges_by_graph[h].emplace_back(ve);
             }
         }
diff --git a/src/parser/ComponentRepeat.cpp b/src/parser/ComponentRepeat.cpp
index 7090459f..01392990 100644
--- a/src/parser/ComponentRepeat.cpp
+++ b/src/parser/ComponentRepeat.cpp
@@ -110,7 +110,7 @@ void addBase(Position base, vector<PositionInfo> &firsts,
 }
 
 static
-void checkPositions(vector<PositionInfo> &v, const GlushkovBuildState &bs) {
+void checkPositions(const vector<PositionInfo> &v, const GlushkovBuildState &bs) {
     const NFABuilder& builder = bs.getBuilder();
     for (const auto &e : v) {
         if (builder.isSpecialState(e.pos)) {
diff --git a/src/parser/buildstate.cpp b/src/parser/buildstate.cpp
index 96f91cb6..66f389fb 100644
--- a/src/parser/buildstate.cpp
+++ b/src/parser/buildstate.cpp
@@ -242,7 +242,7 @@ Position makeNewlineAssertPos(GlushkovBuildState &bs) {
 static
 void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from,
                      vector<PositionInfo> *tolist) {
-    NFABuilder &builder = bs.getBuilder();
+    const NFABuilder &builder = bs.getBuilder();
     u32 flags = from.flags;
 
     bool require_eod = flags & POS_FLAG_WIRE_EOD;
diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp
index 5aed21f5..816acc16 100644
--- a/src/rose/rose_build_add.cpp
+++ b/src/rose/rose_build_add.cpp
@@ -353,7 +353,7 @@ void createVertices(RoseBuildImpl *tbi,
                          edge_props.minBound, edge_props.maxBound);
         }
 
-        for (auto &m : created) {
+        for (const auto &m : created) {
             tbi->ghost[m.second] = g_v;
         }
     }
@@ -938,7 +938,7 @@ void shift_accepts_to_end(const RoseInGraph &ig,
 }
 
 static
-void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) {
+void populateRoseGraph(RoseBuildImpl *tbi, const RoseBuildData &bd) {
     const RoseInGraph &ig = bd.ig;
 
     /* add the pattern in to the main rose graph */
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 06f36582..0283b4d1 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -674,7 +674,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm,
     }
 
     assert(suff.graph());
-    NGHolder &holder = *suff.graph();
+    const NGHolder &holder = *suff.graph();
     assert(holder.kind == NFA_SUFFIX);
     const bool oneTop = onlyOneTop(holder);
     bool compress_state = cc.streaming;
@@ -1378,7 +1378,7 @@ void updateExclusiveSuffixProperties(const RoseBuildImpl &build,
                                 const vector<ExclusiveInfo> &exclusive_info,
                                 set<u32> *no_retrigger_queues) {
     const RoseGraph &g = build.g;
-    for (auto &info : exclusive_info) {
+    for (const auto &info : exclusive_info) {
         const auto &qi = info.queue;
         const auto &subengines = info.subengines;
         bool no_retrigger = true;
@@ -1627,11 +1627,11 @@ public:
                             build.rm);
     }
 
-    bytecode_ptr<NFA> operator()(unique_ptr<NGHolder> &holder) const {
+    bytecode_ptr<NFA> operator()(const unique_ptr<NGHolder> &holder) const {
         const CompileContext &cc = build.cc;
         const ReportManager &rm = build.rm;
 
-        NGHolder &h = *holder;
+        const NGHolder &h = *holder;
         assert(h.kind == NFA_OUTFIX);
 
         // Build NFA.
@@ -1657,7 +1657,7 @@ public:
         return n;
     }
 
-    bytecode_ptr<NFA> operator()(UNUSED MpvProto &mpv) const {
+    bytecode_ptr<NFA> operator()(UNUSED const MpvProto &mpv) const {
         // MPV construction handled separately.
         assert(mpv.puffettes.empty());
         return nullptr;
@@ -2728,7 +2728,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc,
 }
 
 static
-RoseProgram makeLiteralProgram(const RoseBuildImpl &build, build_context &bc,
+RoseProgram makeLiteralProgram(const RoseBuildImpl &build, const build_context &bc,
                                ProgramBuild &prog_build, u32 lit_id,
                                const vector<vector<RoseEdge>> &lit_edge_map,
                                bool is_anchored_replay_program) {
@@ -2973,7 +2973,7 @@ void buildFragmentPrograms(const RoseBuildImpl &build,
                                             pfrag.lit_ids, lit_edge_map);
         if (pfrag.included_frag_id != INVALID_FRAG_ID &&
             !lit_prog.empty()) {
-            auto &cfrag = fragments[pfrag.included_frag_id];
+            const auto &cfrag = fragments[pfrag.included_frag_id];
             assert(pfrag.s.length() >= cfrag.s.length() &&
                    !pfrag.s.any_nocase() >= !cfrag.s.any_nocase());
             u32 child_offset = cfrag.lit_program_offset;
@@ -2992,7 +2992,7 @@ void buildFragmentPrograms(const RoseBuildImpl &build,
                                                     pfrag.lit_ids);
         if (pfrag.included_delay_frag_id != INVALID_FRAG_ID &&
             !rebuild_prog.empty()) {
-            auto &cfrag = fragments[pfrag.included_delay_frag_id];
+            const auto &cfrag = fragments[pfrag.included_delay_frag_id];
             assert(pfrag.s.length() >= cfrag.s.length() &&
                    !pfrag.s.any_nocase() >= !cfrag.s.any_nocase());
             u32 child_offset = cfrag.delay_program_offset;
@@ -3011,7 +3011,7 @@ void updateLitProtoProgramOffset(vector<LitFragment> &fragments,
     auto &proto = *litProto.hwlmProto;
     for (auto &lit : proto.lits) {
         auto fragId = lit.id;
-        auto &frag = fragments[fragId];
+        const auto &frag = fragments[fragId];
         if (delay) {
             DEBUG_PRINTF("delay_program_offset:%u\n",
                          frag.delay_program_offset);
diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp
index bc9b1558..f6159c6a 100644
--- a/src/rose/rose_build_exclusive.cpp
+++ b/src/rose/rose_build_exclusive.cpp
@@ -118,7 +118,7 @@ bool addPrefixLiterals(NGHolder &h, unordered_set<u32> &tailId,
 
     for (auto v : adjacent_vertices_range(start, h)) {
         if (v != h.startDs) {
-            for (auto &t : tails) {
+            for (const auto &t : tails) {
                 add_edge(t, v, h);
             }
         }
@@ -126,7 +126,7 @@ bool addPrefixLiterals(NGHolder &h, unordered_set<u32> &tailId,
 
     clear_out_edges(start, h);
     add_edge(h.start, h.start, h);
-    for (auto &t : heads) {
+    for (const auto &t : heads) {
         add_edge(start, t, h);
     }
 
diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp
index d8b9c951..94fab54f 100644
--- a/src/rose/rose_build_groups.cpp
+++ b/src/rose/rose_build_groups.cpp
@@ -275,7 +275,7 @@ void assignGroupsToLiterals(RoseBuildImpl &build) {
     // Second pass: the other literals.
     for (u32 id = 0; id < literals.size(); id++) {
         const rose_literal_id &lit = literals.at(id);
-        rose_literal_info &info = literal_info[id];
+        const rose_literal_info &info = literal_info[id];
 
         if (!requires_group_assignment(lit, info)) {
             continue;
diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index cddbb760..ba638c4d 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -1338,7 +1338,7 @@ void chunk(vector<T> in, vector<vector<T>> *out, size_t chunk_size) {
 }
 
 static
-insertion_ordered_map<left_id, vector<RoseVertex>> get_eng_verts(RoseGraph &g) {
+insertion_ordered_map<left_id, vector<RoseVertex>> get_eng_verts(const RoseGraph &g) {
     insertion_ordered_map<left_id, vector<RoseVertex>> eng_verts;
     for (auto v : vertices_range(g)) {
         const auto &left = g[v].left;
@@ -1924,7 +1924,7 @@ void mergeSmallLeftfixes(RoseBuildImpl &tbi) {
         }
 
         assert(left.graph());
-        NGHolder &h = *left.graph();
+        const NGHolder &h = *left.graph();
 
         /* Ensure that kind on the graph is correct */
         assert(h.kind == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX));
@@ -2024,7 +2024,7 @@ void mergeCastleLeftfixes(RoseBuildImpl &build) {
         return;
     }
 
-    RoseGraph &g = build.g;
+    const RoseGraph &g = build.g;
 
     insertion_ordered_map<left_id, vector<RoseVertex>> eng_verts;
 
@@ -2306,7 +2306,7 @@ void mergeOutfixInfo(OutfixInfo &winner, const OutfixInfo &victim) {
 }
 
 static
-map<NGHolder *, NGHolder *> chunkedNfaMerge(RoseBuildImpl &build,
+map<NGHolder *, NGHolder *> chunkedNfaMerge(const RoseBuildImpl &build,
                                             const vector<NGHolder *> &nfas) {
     map<NGHolder *, NGHolder *> merged;
 
diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp
index 2888b9a0..7c32e34b 100644
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@@ -1352,8 +1352,8 @@ bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     assert(a != b);
 
     RoseGraph &g = build.g;
-    LeftEngInfo &a_left = g[a].left;
-    LeftEngInfo &b_left = g[b].left;
+    const LeftEngInfo &a_left = g[a].left;
+    const LeftEngInfo &b_left = g[b].left;
 
     // Trivial case.
     if (a_left == b_left) {
@@ -1601,7 +1601,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &build,
                       vector<RoseVertex> *dead, bool mergeRoses,
                       RoseAliasingInfo &rai) {
     DEBUG_PRINTF("begin\n");
-    RoseGraph &g = build.g;
+    const RoseGraph &g = build.g;
 
     if (candidates.empty()) {
         return;
@@ -1972,7 +1972,7 @@ bool hasNoDiamondSiblings(const RoseGraph &g, RoseVertex v) {
  * merge.
  */
 static
-void filterDiamondCandidates(RoseGraph &g, CandidateSet &candidates) {
+void filterDiamondCandidates(const RoseGraph &g, CandidateSet &candidates) {
     DEBUG_PRINTF("%zu candidates enter\n", candidates.size());
 
     vector<RoseVertex> dead;
diff --git a/src/som/slot_manager.cpp b/src/som/slot_manager.cpp
index 33b8d503..6808ac3c 100644
--- a/src/som/slot_manager.cpp
+++ b/src/som/slot_manager.cpp
@@ -187,7 +187,7 @@ u32 SomSlotManager::getInitialResetSomSlot(const NGHolder &prefix,
             find_if(reset.entries.begin(), reset.entries.end(),
                     has_prefix_func) != reset.entries.end();
 
-        for (auto &e : reset.entries) {
+        for (const auto &e : reset.entries) {
             u32 temp = 0;
             /* we don't need to test against sentinels which are identical to
              * our current one as races don't matter and we know it clears
diff --git a/src/util/clique.cpp b/src/util/clique.cpp
index 19daed3c..5dbcc1d8 100644
--- a/src/util/clique.cpp
+++ b/src/util/clique.cpp
@@ -79,7 +79,7 @@ vector<u32> findCliqueGroup(CliqueGraph &cg) {
 
         // Choose a vertex from the graph
         u32 id = g[0];
-        CliqueVertex &n = vertexMap.at(id);
+        const CliqueVertex &n = vertexMap.at(id);
         clique.emplace_back(id);
         // Corresponding vertex in the original graph
         set<u32> subgraphId(g.begin(), g.end());
diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp
index 95461de5..1f84bab7 100644
--- a/tools/hsbench/engine_hyperscan.cpp
+++ b/tools/hsbench/engine_hyperscan.cpp
@@ -132,7 +132,7 @@ void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id,
                            ResultEntry &result, EngineContext &ectx) const {
     assert(data);
 
-    EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
+    const EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
     ScanHSContext sc(id, result, nullptr);
     auto callback = echo_matches ? onMatchEcho : onMatch;
     hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc);
@@ -150,7 +150,7 @@ void EngineHyperscan::scan_vectored(const char *const *data,
     assert(data);
     assert(len);
 
-    EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
+    const EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
     ScanHSContext sc(streamId, result, nullptr);
     auto callback = echo_matches ? onMatchEcho : onMatch;
     hs_error_t rv =
@@ -198,8 +198,8 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data,
                                  ResultEntry &result) const {
     assert(data);
 
-    auto &s = static_cast<EngineHSStream &>(stream);
-    EngineHSContext &ctx = *s.ctx;
+    const auto &s = static_cast<EngineHSStream &>(stream);
+    const EngineHSContext &ctx = *s.ctx;
 
     ScanHSContext sc(id, result, &s);
     auto callback = echo_matches ? onMatchEcho : onMatch;
@@ -215,7 +215,7 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data,
 void EngineHyperscan::streamCompressExpand(EngineStream &stream,
                                            vector<char> &temp) const {
     size_t used = 0;
-    auto &s = static_cast<EngineHSStream &>(stream);
+    const auto &s = static_cast<EngineHSStream &>(stream);
     hs_error_t err = hs_compress_stream(s.id, temp.data(), temp.size(),
                                         &used);
     if (err == HS_INSUFFICIENT_SPACE) {
diff --git a/unit/hyperscan/logical_combination.cpp b/unit/hyperscan/logical_combination.cpp
index 9558948f..20b6e5a1 100644
--- a/unit/hyperscan/logical_combination.cpp
+++ b/unit/hyperscan/logical_combination.cpp
@@ -45,8 +45,8 @@ TEST(LogicalCombination, SingleComb1) {
     string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 1001};
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
+    const unsigned ids[] = {101, 102, 103, 104, 105, 1001};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -92,9 +92,9 @@ TEST(LogicalCombination, SingleCombQuietSub1) {
     string data = "abcdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 1001};
+    const unsigned ids[] = {101, 102, 103, 104, 105, 1001};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -133,11 +133,11 @@ TEST(LogicalCombination, MultiCombQuietSub1) {
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)",
                           "!101 & 102", "!(!101 | 102)", "101 & !102"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
+    const unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -178,13 +178,13 @@ TEST(LogicalCombination, MultiHighlanderCombQuietSub1) {
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)",
                           "!101 & 102", "!(!101 | 102)", "101 & !102"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, 0,
                         HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH,
                         HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH,
                         HS_FLAG_COMBINATION | HS_FLAG_SINGLEMATCH};
-    unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
+    const unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -219,11 +219,11 @@ TEST(LogicalCombination, MultiQuietCombQuietSub1) {
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(101 & 102 & 103) | (104 & !105)",
                           "!101 & 102", "!(!101 | 102)", "101 & !102"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, 0, HS_FLAG_COMBINATION | HS_FLAG_QUIET,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION | HS_FLAG_QUIET};
-    unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
+    const unsigned ids[] = {101, 102, 103, 104, 105, 1001, 1002, 1003, 1004};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 9, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -255,8 +255,8 @@ TEST(LogicalCombination, SingleComb2) {
     string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(201 | 202 & 203) & (!204 | 205)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1002};
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1002};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -299,9 +299,9 @@ TEST(LogicalCombination, SingleCombQuietSub2) {
     string data = "abbdefxxfoobarrrghabcxdefxteakettleeeeexxxxijklmxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(201 | 202 & 203) & (!204 | 205)"};
-    unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET,
+    const unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1002};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1002};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -339,8 +339,8 @@ TEST(LogicalCombination, SingleComb3) {
     string data = "abcijklndefxxfoobarrrghabcxdefxteakettleeeeexxxxijklnxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
-    unsigned ids[] = {301, 302, 303, 304, 305, 1003};
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
+    const unsigned ids[] = {301, 302, 303, 304, 305, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -387,9 +387,9 @@ TEST(LogicalCombination, SingleCombQuietSub3) {
     string data = "abcijklndefxxfoobarrrghabcxdefxteakettleeeeexxxxijklnxxdef";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, HS_FLAG_COMBINATION};
-    unsigned ids[] = {301, 302, 303, 304, 305, 1003};
+    const unsigned ids[] = {301, 302, 303, 304, 305, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -429,9 +429,9 @@ TEST(LogicalCombination, MultiCombDupSub4) {
                           "ijkl[mMn]", "(201 & 202 & 203) | (204 & !205)",
                           "(201 | 202 & 203) & (!204 | 205)",
                           "((201 | 202) & 203) & (204 | 205)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION,
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 8, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -486,10 +486,10 @@ TEST(LogicalCombination, MultiCombQuietDupSub4) {
                           "ijkl[mMn]", "(201 & 202 & 203) | (204 & !205)",
                           "(201 | 202 & 203) & (!204 | 205)",
                           "((201 | 202) & 203) & (204 | 205)"};
-    unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0,
+    const unsigned flags[] = {HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0,
                         HS_FLAG_QUIET, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 8, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -546,10 +546,10 @@ TEST(LogicalCombination, MultiCombUniSub5) {
                           "(101 & 102 & 103) | (104 & !105)",
                           "(201 | 202 & 203) & (!204 | 205)",
                           "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    const unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
+    const unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
                       302, 303, 304, 305, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
@@ -636,12 +636,12 @@ TEST(LogicalCombination, MultiCombQuietUniSub5) {
                           "(101 & 102 & 103) | (104 & !105)",
                           "(201 | 202 & 203) & (!204 | 205)",
                           "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0,
+    const unsigned flags[] = {0, HS_FLAG_QUIET, HS_FLAG_QUIET, HS_FLAG_QUIET, 0,
                         HS_FLAG_QUIET, 0, HS_FLAG_QUIET, 0, HS_FLAG_QUIET,
                         HS_FLAG_QUIET, HS_FLAG_QUIET, 0, HS_FLAG_QUIET, 0,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
+    const unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
                       302, 303, 304, 305, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
@@ -702,8 +702,8 @@ TEST(LogicalCombination, SingleCombPurelyNegative6) {
     string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(!201 | 202 & 203) & (!204 | 205)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1002};
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1002};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -734,8 +734,8 @@ TEST(LogicalCombination, SingleCombQuietPurelyNegative6) {
     string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
     const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}",
                           "ijkl[mMn]", "(!201 | 202 & 203) & (!204 | 205)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION | HS_FLAG_QUIET};
-    unsigned ids[] = {201, 202, 203, 204, 205, 1002};
+    const unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION | HS_FLAG_QUIET};
+    const unsigned ids[] = {201, 202, 203, 204, 205, 1002};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -775,10 +775,10 @@ TEST(LogicalCombination, MultiCombPurelyNegativeUniSub6) {
                           "(101 & 102 & 103) | (!104 & !105)",
                           "(!201 | 202 & 203) & (!204 | 205)",
                           "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    const unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
+    const unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
                       302, 303, 304, 305, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
@@ -822,11 +822,11 @@ TEST(LogicalCombination, MultiCombPurelyNegativeUniSubEOD6) {
                           "(101 & 102 & 103) | (!104 & !105)",
                           "(!201 | 202 & 203) & (!204 | 205)",
                           "((301 | 302) & 303) & (304 | 305)"};
-    unsigned flags[] = {0, 0, 0, 0, 0, 0, HS_FLAG_MULTILINE,
+    const unsigned flags[] = {0, 0, 0, 0, 0, 0, HS_FLAG_MULTILINE,
                         0, 0, 0, 0, 0, 0, 0, 0,
                         HS_FLAG_COMBINATION, HS_FLAG_COMBINATION,
                         HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
+    const unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301,
                       302, 303, 304, 305, 1001, 1002, 1003};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
@@ -875,8 +875,8 @@ TEST(LogicalCombination, MultiCombStream1) {
                      "z"};
     const char *expr[] = {"abc", "def", "xyz", "zxyz",
                           "101 & 102", "201 & !202"};
-    unsigned flags[] = {0, 0, 0, 0, HS_FLAG_COMBINATION, HS_FLAG_COMBINATION};
-    unsigned ids[] = {101, 102, 201, 202, 1001, 1002};
+    const unsigned flags[] = {0, 0, 0, 0, HS_FLAG_COMBINATION, HS_FLAG_COMBINATION};
+    const unsigned ids[] = {101, 102, 201, 202, 1001, 1002};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_STREAM,
                                       nullptr, &db, &compile_err);
 
diff --git a/unit/hyperscan/multi.cpp b/unit/hyperscan/multi.cpp
index 85d8cd25..3a344fe5 100644
--- a/unit/hyperscan/multi.cpp
+++ b/unit/hyperscan/multi.cpp
@@ -44,8 +44,8 @@ TEST(MMAdaptor, norm_cont1) { // UE-901
     CallBackContext c;
     string data = "aooAaooAbarZ";
     const char *expr[] = {"aoo[A-K]", "bar[L-Z]"};
-    unsigned flags[] = {0, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {0, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -77,8 +77,8 @@ TEST(MMAdaptor, norm_cont2) {
     CallBackContext c;
     string data = "aooAaooAbarZ                      ";
     const char *expr[] = {"aoo[A-K][^\n]{16}", "bar[L-Z][^\n]{16}"};
-    unsigned flags[] = {0, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {0, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -110,8 +110,8 @@ TEST(MMAdaptor, norm_halt1) {
     CallBackContext c;
     string data = "aooAaooAbarZ";
     const char *expr[] = {"aoo[A-K]", "bar[L-Z]"};
-    unsigned flags[] = {0, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {0, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -141,8 +141,8 @@ TEST(MMAdaptor, norm_halt2) { // UE-901
     CallBackContext c;
     string data = "aooAaooAbarZ                      ";
     const char *expr[] = {"aoo[A-K][^\n]{16}", "bar[L-Z][^\n]{16}"};
-    unsigned flags[] = {0, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {0, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -172,8 +172,8 @@ TEST(MMAdaptor, high_cont1) { // UE-901
     CallBackContext c;
     string data = "aooAaooAbarZ";
     const char *expr[] = {"aoo[A-K]", "bar[L-Z]"};
-    unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -204,8 +204,8 @@ TEST(MMAdaptor, high_cont2) {
     CallBackContext c;
     string data = "aooAaooAbarZ                      ";
     const char *expr[] = {"aoo[A-K][^\n]{16}", "bar[L-Z][^\n]{16}"};
-    unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -236,8 +236,8 @@ TEST(MMAdaptor, high_halt1) {
     CallBackContext c;
     string data = "aooAaooAbarZ";
     const char *expr[] = {"aoo[A-K]", "bar[L-Z]"};
-    unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -267,8 +267,8 @@ TEST(MMAdaptor, high_halt2) {
     CallBackContext c;
     string data = "aooAaooAbarZbarZaooA                      ";
     const char *expr[] = {"aoo[A-K][^\n]{16}", "bar[L-Z][^\n]{16}"};
-    unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
-    unsigned ids[] = {30, 31};
+    const unsigned flags[] = {HS_FLAG_SINGLEMATCH, 0};
+    const unsigned ids[] = {30, 31};
     hs_error_t err = hs_compile_multi(expr, flags, ids, 2, HS_MODE_NOSTREAM,
                                       nullptr, &db, &compile_err);
 
@@ -342,7 +342,7 @@ TEST(MMRoseLiteralPath, issue_141) {
     const char *expr[] = {"/odezhda-dlya-bega/",
                           "kurtki-i-vetrovki-dlya-bega",
                           "futbolki-i-mayki-dlya-bega"};
-    unsigned flags[] = {HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+    const unsigned flags[] = {HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
                         HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
                         HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH};
     hs_error_t err = hs_compile_multi(expr, flags, nullptr, 3, HS_MODE_BLOCK,
diff --git a/unit/internal/nfagraph_common.h b/unit/internal/nfagraph_common.h
index ca5554c4..61ece377 100644
--- a/unit/internal/nfagraph_common.h
+++ b/unit/internal/nfagraph_common.h
@@ -41,7 +41,7 @@ namespace ue2 {
 // Helper function: construct a graph from an expression, flags and context.
 inline
 std::unique_ptr<NGHolder> constructGraphWithCC(const std::string &expr,
-                                               CompileContext &cc,
+                                               const CompileContext &cc,
                                                unsigned flags) {
     ReportManager rm(cc.grey);
     ParsedExpression parsed(0, expr.c_str(), flags, 0);
diff --git a/unit/internal/state_compress.cpp b/unit/internal/state_compress.cpp
index 00423702..d1442b4f 100644
--- a/unit/internal/state_compress.cpp
+++ b/unit/internal/state_compress.cpp
@@ -152,7 +152,7 @@ TEST(state_compress, m128_1) {
 TEST(state_compress, m128_2) {
     char buf[sizeof(m128)] = { 0 };
 
-    char val_raw[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
+    const char val_raw[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
                          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
     m128 val;
     memcpy(&val, val_raw, sizeof(val));
@@ -228,7 +228,7 @@ TEST(state_compress, m256_1) {
 TEST(state_compress, m256_2) {
     char buf[sizeof(m256)] = { 0 };
 
-    char val_raw[32] = { '0', '1', '2', '3', '4', '5', '6', '7',
+    const char val_raw[32] = { '0', '1', '2', '3', '4', '5', '6', '7',
                          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
                          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                          'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P' };
@@ -306,7 +306,7 @@ TEST(state_compress, m384_1) {
 TEST(state_compress, m384_2) {
     char buf[sizeof(m384)] = { 0 };
 
-    char val_raw[48] = { '0', '1', '2', '3', '4', '5', '6', '7',
+    const char val_raw[48] = { '0', '1', '2', '3', '4', '5', '6', '7',
                          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
                          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                          'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
@@ -386,7 +386,7 @@ TEST(state_compress, m512_1) {
 TEST(state_compress, m512_2) {
     char buf[sizeof(m512)] = { 0 };
 
-    char val_raw[64] = { '0', '1', '2', '3', '4', '5', '6', '7',
+    const char val_raw[64] = { '0', '1', '2', '3', '4', '5', '6', '7',
                          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
                          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                          'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp
index c406ee95..56c38b97 100644
--- a/util/ng_find_matches.cpp
+++ b/util/ng_find_matches.cpp
@@ -861,7 +861,7 @@ bool isUtf8CodePoint(const char c) {
 }
 
 static
-bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) {
+bool canReach(const NGHolder &g, const NFAEdge &e, const struct fmstate &state) {
     auto flags = g[e].assert_flags;
     if (!flags) {
         return true;

From d3dd4486417ba8a9de8d51c406af8edec0f48f18 Mon Sep 17 00:00:00 2001
From: "G.E" <gregory.economou@vectorcamp.gr>
Date: Wed, 1 May 2024 11:22:32 +0300
Subject: [PATCH 53/56] rose: restore cfrag declaration dropped by a bad merge

---
 src/rose/rose_build_bytecode.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 0d3d0352..765d9ca6 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -2992,6 +2992,7 @@ void buildFragmentPrograms(const RoseBuildImpl &build,
                                                     pfrag.lit_ids);
         if (pfrag.included_delay_frag_id != INVALID_FRAG_ID &&
             !rebuild_prog.empty()) {
+            const auto &cfrag = fragments[pfrag.included_delay_frag_id];
             /** assert(pfrag.s.length() >= cfrag.s.length() && **/
             assert(pfrag.s.length() == cfrag.s.length() &&
                    !pfrag.s.any_nocase() >= !cfrag.s.any_nocase());

From ea420114a7d6ea4447638dfb60cf0f2fc7f76e71 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 1 May 2024 12:59:34 +0300
Subject: [PATCH 54/56] cppcheck: address knownConditionTrueFalse warnings

---
 src/hs_valid_platform.c       |  1 +
 src/nfa/limex_compile.cpp     |  1 +
 src/rose/rose_build_merge.cpp | 19 +++++--------------
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c
index 74a8fc1e..3fa1b08d 100644
--- a/src/hs_valid_platform.c
+++ b/src/hs_valid_platform.c
@@ -48,6 +48,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) {
         return HS_ARCH_ERROR;
     }
 #elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
+   // NOTE: check_neon() currently always returns true.
    if (check_neon()) {
         return HS_SUCCESS;
     } else {
diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp
index 2ec65552..5bc27301 100644
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@@ -1481,6 +1481,7 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
                     continue;
                 }
                 u32 j = args.state_ids.at(w);
+                // args.state_ids may map w to NO_STATE, so j must be checked here.
                 if (j == NO_STATE) {
                     continue;
                 }
diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index 1e6c9222..c0df5765 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -1718,9 +1718,8 @@ bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
 bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                          const deque<RoseVertex> &verts1) {
     map<u32, u32> top_mapping;
-    if (!setDistinctTops(h1, h2, top_mapping)) {
-        return false;
-    }
+
+    setDistinctTops(h1, h2, top_mapping);
 
     if (top_mapping.empty()) {
         return true; // No remapping necessary.
@@ -1748,9 +1747,7 @@ static
 bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                            const deque<RoseVertex> &verts1) {
     map<u32, u32> top_mapping;
-    if (!setDistinctTops(h1, h2, top_mapping)) {
-        return false;
-    }
+    setDistinctTops(h1, h2, top_mapping);
 
     if (top_mapping.empty()) {
         return true; // No remapping necessary.
@@ -1837,10 +1834,7 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, LeftfixBouquet &roses) {
                 }
             }
 
-            if (!setDistinctRoseTops(g, victim, *r1.graph(), verts2)) {
-                DEBUG_PRINTF("can't set distinct tops\n");
-                continue; // next h2
-            }
+            setDistinctRoseTops(g, victim, *r1.graph(), verts2);
 
             assert(victim.kind == r1.graph()->kind);
             assert(!generates_callbacks(*r1.graph()));
@@ -2120,10 +2114,7 @@ void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes,
                 old_tops[v] = g[v].suffix.top;
             }
 
-            if (!setDistinctSuffixTops(g, victim, *s1.graph(), verts2)) {
-                DEBUG_PRINTF("can't set distinct tops\n");
-                continue; // next h2
-            }
+            setDistinctSuffixTops(g, victim, *s1.graph(), verts2);
 
             if (!mergeNfaPair(victim, *s1.graph(), &tbi.rm, tbi.cc)) {
                 DEBUG_PRINTF("merge failed\n");

From 389b55c647da265a9ed68cffe7cbb6f93b6b7e99 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 1 May 2024 15:21:36 +0300
Subject: [PATCH 55/56] refactor setDistinctTops, setDistinctRoseTops and
 setDistinctSuffixTops to return void instead of bool

---
 src/rose/rose_build_merge.cpp         | 18 +++++++++---------
 src/rose/rose_build_merge.h           |  2 +-
 src/rose/rose_build_role_aliasing.cpp |  7 +------
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index c0df5765..21093a81 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -1687,7 +1687,7 @@ void replaceTops(NGHolder &h, const map<u32, u32> &top_mapping) {
 }
 
 static
-bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
+void setDistinctTops(NGHolder &h1, const NGHolder &h2,
                      map<u32, u32> &top_mapping) {
     flat_set<u32> tops1 = getTops(h1), tops2 = getTops(h2);
 
@@ -1697,7 +1697,7 @@ bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
     // If our tops don't intersect, we're OK to merge with no changes.
     if (!has_intersection(tops1, tops2)) {
         DEBUG_PRINTF("tops don't intersect\n");
-        return true;
+        return ;
     }
 
     // Otherwise, we have to renumber the tops in h1 so that they don't overlap
@@ -1712,17 +1712,17 @@ bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
     }
 
     replaceTops(h1, top_mapping);
-    return true;
+    return ;
 }
 
-bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
+void setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                          const deque<RoseVertex> &verts1) {
     map<u32, u32> top_mapping;
 
     setDistinctTops(h1, h2, top_mapping);
 
     if (top_mapping.empty()) {
-        return true; // No remapping necessary.
+        return ; // No remapping necessary.
     }
 
     for (auto v : verts1) {
@@ -1740,17 +1740,17 @@ bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
         }
     }
 
-    return true;
+    return ;
 }
 
 static
-bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
+void setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                            const deque<RoseVertex> &verts1) {
     map<u32, u32> top_mapping;
     setDistinctTops(h1, h2, top_mapping);
 
     if (top_mapping.empty()) {
-        return true; // No remapping necessary.
+        return ; // No remapping necessary.
     }
 
     for (auto v : verts1) {
@@ -1760,7 +1760,7 @@ bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
         g[v].suffix.top = top_mapping[t];
     }
 
-    return true;
+    return ;
 }
 
 /** \brief Estimate the number of accel states in the given graph when built as
diff --git a/src/rose/rose_build_merge.h b/src/rose/rose_build_merge.h
index 6de6c778..e93a977f 100644
--- a/src/rose/rose_build_merge.h
+++ b/src/rose/rose_build_merge.h
@@ -62,7 +62,7 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u,
 bool mergeableRoseVertices(const RoseBuildImpl &tbi,
                            const std::set<RoseVertex> &v1,
                            const std::set<RoseVertex> &v2);
-bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
+void setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                          const std::deque<RoseVertex> &verts1);
 
 } // namespace ue2
diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp
index 2888b9a0..39721d6f 100644
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@@ -1294,12 +1294,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
 
     DEBUG_PRINTF("victim %zu states\n", num_vertices(*a_h));
     DEBUG_PRINTF("winner %zu states\n", num_vertices(*b_h));
-
-    if (!setDistinctRoseTops(g, victim, *b_h, deque<RoseVertex>(1, a))) {
-        assert(roseHasTops(build, a));
-        assert(roseHasTops(build, b));
-        return false;
-    }
+    setDistinctRoseTops(g, victim, *b_h, deque<RoseVertex>(1, a));
 
     assert(victim.kind == b_h->kind);
     assert(!generates_callbacks(*b_h));

From a634d57b2d5844b15f3168453f0bbd76d93d67e3 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Thu, 2 May 2024 10:13:55 +0300
Subject: [PATCH 56/56] knownConditionTrueFalse: fix warnings previously treated as false positives

---
 src/nfa/repeat.c                    |  4 ++--
 src/rose/rose_build_bytecode.cpp    | 16 ++++------------
 unit/internal/fdr_flood.cpp         | 18 ++++++------------
 unit/internal/flat_set.cpp          |  4 +---
 unit/internal/insertion_ordered.cpp |  4 +---
 5 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/src/nfa/repeat.c b/src/nfa/repeat.c
index 5b2e4df4..07d02082 100644
--- a/src/nfa/repeat.c
+++ b/src/nfa/repeat.c
@@ -785,7 +785,7 @@ enum RepeatMatch repeatHasMatchRange(const struct RepeatInfo *info,
     if (diff > info->repeatMax) {
         DEBUG_PRINTF("range list is stale\n");
         return REPEAT_STALE;
-    } else if (diff >= info->repeatMin && diff <= info->repeatMax) {
+    } else if (diff >= info->repeatMin) {
         return REPEAT_MATCH;
     }
 
@@ -836,7 +836,7 @@ enum RepeatMatch repeatHasMatchBitmap(const struct RepeatInfo *info,
     if (diff > info->repeatMax) {
         DEBUG_PRINTF("stale\n");
         return REPEAT_STALE;
-    } else if (diff >= info->repeatMin && diff <= info->repeatMax) {
+    } else if (diff >= info->repeatMin) {
         return REPEAT_MATCH;
     }
 
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 639edddc..181d77e5 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -1495,7 +1495,7 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc,
 }
 
 static
-bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc,
+void buildLeftfixes(RoseBuildImpl &tbi, build_context &bc,
                     QueueIndexFactory &qif, set<u32> *no_retrigger_queues,
                     set<u32> *eager_queues, bool do_prefix) {
     RoseGraph &g = tbi.g;
@@ -1581,7 +1581,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc,
                      leftfix);
     }
 
-    return true;
+    return ;
 }
 
 static
@@ -2059,16 +2059,8 @@ bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif,
     suffixTriggers.clear();
 
     *leftfixBeginQueue = qif.allocated_count();
-
-    if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues,
-                        true)) {
-        return false;
-    }
-
-    if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues,
-                        false)) {
-        return false;
-    }
+    buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues,true);
+    buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues,false);
 
     return true;
 }
diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp
index 77d3ff47..fd8a9734 100644
--- a/unit/internal/fdr_flood.cpp
+++ b/unit/internal/fdr_flood.cpp
@@ -154,7 +154,7 @@ TEST_P(FDRFloodp, NoMask) {
 
     struct hs_scratch scratch;
     scratch.fdr_conf = NULL;
-    while (1) {
+    while (c != 255) {
         SCOPED_TRACE((unsigned int)c);
         u8 bit = 1 << (c & 0x7);
         u8 cAlt = c ^ bit;
@@ -233,9 +233,7 @@ TEST_P(FDRFloodp, NoMask) {
         }
         matchesCounts.clear();
 
-        if (++c == 0) {
-            break;
-        }
+        ++c;
     }
 }
 
@@ -248,7 +246,7 @@ TEST_P(FDRFloodp, WithMask) {
 
     struct hs_scratch scratch;
     scratch.fdr_conf = NULL;
-    while (1) {
+    while (c != 255) {
         u8 bit = 1 << (c & 0x7);
         u8 cAlt = c ^ bit;
         SCOPED_TRACE((unsigned int)c);
@@ -396,9 +394,7 @@ TEST_P(FDRFloodp, WithMask) {
         }
         matchesCounts.clear();
 
-        if (++c == '\0') {
-            break;
-        }
+        ++c;
     }
 }
 
@@ -414,7 +410,7 @@ TEST_P(FDRFloodp, StreamingMask) {
 
     struct hs_scratch scratch;
     scratch.fdr_conf = NULL;
-    while (1) {
+    while (c != 255) {
         u8 bit = 1 << (c & 0x7);
         u8 cAlt = c ^ bit;
         SCOPED_TRACE((unsigned int)c);
@@ -548,9 +544,7 @@ TEST_P(FDRFloodp, StreamingMask) {
             }
         }
 
-        if (++c == '\0') {
-            break;
-        }
+        ++c;
     }
     matchesCounts.clear();
 }
diff --git a/unit/internal/flat_set.cpp b/unit/internal/flat_set.cpp
index 10607a6f..174a4771 100644
--- a/unit/internal/flat_set.cpp
+++ b/unit/internal/flat_set.cpp
@@ -48,9 +48,7 @@ std::ostream &operator<<(std::ostream &os, const flat_set<T> &f) {
     os << "{";
     for (auto it = begin(f); it != end(f); ++it) {
         os << *it;
-        if (it != end(f)) {
-            os << ", ";
-        }
+        os << ", ";
     }
     os << "}";
     return os;
diff --git a/unit/internal/insertion_ordered.cpp b/unit/internal/insertion_ordered.cpp
index 6026ce1d..2d799aa9 100644
--- a/unit/internal/insertion_ordered.cpp
+++ b/unit/internal/insertion_ordered.cpp
@@ -149,9 +149,7 @@ std::ostream &operator<<(std::ostream &os, const insertion_ordered_set<K> &s) {
     os << "{";
     for (auto it = begin(s); it != end(s); ++it) {
         os << *it;
-        if (it != end(s)) {
-            os << ", ";
-        }
+        os << ", ";
     }
     os << "}";
     return os;