Merge branch develop into master

This commit is contained in:
Matthew Barr 2016-06-01 11:09:05 +10:00
commit e3e0a0fab0
270 changed files with 21472 additions and 15494 deletions

4
.gitignore vendored
View File

@ -46,10 +46,6 @@ sqlite3
src/config.h src/config.h
src/config.h.in src/config.h.in
src/hs_version.h src/hs_version.h
src/fdr/fdr_autogen.c
src/fdr/fdr_autogen_compiler.cpp
src/fdr/teddy_autogen.c
src/fdr/teddy_autogen_compiler.cpp
src/parser/Parser.cpp src/parser/Parser.cpp
# Generated PCRE files # Generated PCRE files

View File

@ -2,6 +2,40 @@
This is a list of notable changes to Hyperscan, in reverse chronological order. This is a list of notable changes to Hyperscan, in reverse chronological order.
## [4.2.0] 2016-05-31
- Introduce an interpreter for many complex actions to replace the use of
internal reports within the core of Hyperscan (the "Rose" engine). This
improves scanning performance and reduces database size for many pattern
sets.
- Many enhancements to the acceleration framework used by NFA and DFA engines,
including more flexible multibyte implementations and more AVX2 support. This
improves scanning performance for many pattern sets.
- Improved prefiltering support for complex patterns containing very large
bounded repeats (`R{M,N}` with large `N`).
- Improve scanning performance of pattern sets with a very large number of
EOD-anchored patterns.
- Improve scanning performance of large pattern sets that use the
`HS_FLAG_SINGLEMATCH` flag.
- Improve scanning performance of pattern sets that contain a single literal by
improving the "Noodle" literal matcher.
- Small reductions in total stream state for many pattern sets.
- Improve runtime detection of AVX2 support.
- Disable -Werror for release builds, in order to behave better for packagers
and users with different compiler combinations than those that we test.
- Improve support for building on Windows with MSVC 2015 (github issue #14).
Support for Hyperscan on Windows is still experimental.
- Small updates to fix warnings identified by Coverity.
- Remove Python codegen for the "FDR" and "Teddy" literal matchers. These are
now implemented directly in C code.
- Remove the specialist "Sidecar" engine in favour of using our more general
repeat engines.
- New API function: add the `hs_expression_ext_info()` function. This is a
variant of `hs_expression_info()` that can accept patterns with extended
parameters.
- New API error value: add the `HS_SCRATCH_IN_USE` error, which is returned
when Hyperscan detects that a scratch region is already in use on entry to an
API function.
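To make the `hs_expression_ext_info()` entry above concrete, here is a hedged usage sketch; the `hs_expr_ext_t` fields and `HS_EXT_FLAG_*` constants shown are assumed from the 4.2.0 `hs_compile.h` and are illustrative only:

#include <string.h>
#include <stdlib.h>
#include <hs/hs.h>

/* Sketch: query expression info for a pattern with extended parameters.
 * The hs_expr_ext_t fields used here (min_offset/max_offset) are assumed
 * from the 4.2.0 API; error handling is abbreviated. */
static void query_ext_info(void) {
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET;
    ext.min_offset = 16;
    ext.max_offset = 1024;

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_expression_ext_info("foo.*bar", HS_FLAG_DOTALL, &ext, &info,
                               &compile_err) != HS_SUCCESS) {
        hs_free_compile_error(compile_err);
        return;
    }
    /* info->min_width and info->max_width are now available. */
    free(info); /* allocated with the misc allocator (malloc by default) */
}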
## [4.1.0] 2015-12-18 ## [4.1.0] 2015-12-18
- Update version of PCRE used by testing tools as a syntax and semantic - Update version of PCRE used by testing tools as a syntax and semantic
reference to PCRE 8.38. reference to PCRE 8.38.

View File

@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 2.8.11)
project (Hyperscan C CXX) project (Hyperscan C CXX)
set (HS_MAJOR_VERSION 4) set (HS_MAJOR_VERSION 4)
set (HS_MINOR_VERSION 1) set (HS_MINOR_VERSION 2)
set (HS_PATCH_VERSION 0) set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
@ -75,7 +75,7 @@ if(NOT Boost_FOUND)
set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include") set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
find_package(Boost ${BOOST_MINVERSION}) find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND) if(NOT Boost_FOUND)
message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
endif() endif()
endif() endif()
@ -115,7 +115,9 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
endif() endif()
#for config #for config
set(HS_OPTIMIZE OPTIMISE) if (OPTIMISE)
set(HS_OPTIMIZE ON)
endif()
CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF)
@ -171,8 +173,14 @@ else()
endif() endif()
# set compiler flags - more are tested and added later # set compiler flags - more are tested and added later
set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -Werror") set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual")
set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Werror -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor") set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor")
if (NOT RELEASE_BUILD)
# -Werror is most useful during development, don't potentially break
# release builds
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
endif()
if (NOT CMAKE_C_FLAGS MATCHES .*march.*) if (NOT CMAKE_C_FLAGS MATCHES .*march.*)
message(STATUS "Building for current host CPU") message(STATUS "Building for current host CPU")
@ -229,6 +237,9 @@ if (RELEASE_BUILD)
endif() endif()
endif() endif()
# ensure we are building for the right target arch
include (${CMAKE_MODULE_PATH}/arch.cmake)
# testing a builtin takes a little more work # testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
@ -332,7 +343,7 @@ endif()
add_subdirectory(util) add_subdirectory(util)
add_subdirectory(unit) add_subdirectory(unit)
add_subdirectory(doc/dev-reference) add_subdirectory(doc/dev-reference)
if (EXISTS ${CMAKE_SOURCE_DIR}/tools) if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
add_subdirectory(tools) add_subdirectory(tools)
endif() endif()
@ -340,8 +351,15 @@ endif()
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h) configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
if (PKG_CONFIG_FOUND) if (NOT WIN32)
# we really only need to do this if we have pkg-config # expand out library names for pkgconfig static link info
foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
# this is fragile, but protects us from toolchain specific files
if (NOT EXISTS ${LIB})
set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
endif()
endforeach()
configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars
install(FILES ${CMAKE_BINARY_DIR}/libhs.pc install(FILES ${CMAKE_BINARY_DIR}/libhs.pc
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig") DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
@ -352,11 +370,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# include the autogen targets
add_subdirectory(src/fdr)
include_directories(${PROJECT_BINARY_DIR}/src/fdr)
if(NOT WIN32) if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused") set(RAGEL_C_FLAGS "-Wno-unused")
endif() endif()
@ -376,14 +389,13 @@ SET(hs_HEADERS
) )
install(FILES ${hs_HEADERS} DESTINATION include/hs) install(FILES ${hs_HEADERS} DESTINATION include/hs)
set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
set (hs_exec_SRCS set (hs_exec_SRCS
${hs_HEADERS} ${hs_HEADERS}
src/hs_version.h src/hs_version.h
src/ue2common.h src/ue2common.h
src/alloc.c src/alloc.c
src/allocator.h src/allocator.h
src/report.h
src/runtime.c src/runtime.c
src/fdr/fdr.c src/fdr/fdr.c
src/fdr/fdr.h src/fdr/fdr.h
@ -394,7 +406,9 @@ set (hs_exec_SRCS
src/fdr/flood_runtime.h src/fdr/flood_runtime.h
src/fdr/fdr_loadval.h src/fdr/fdr_loadval.h
src/fdr/teddy.c src/fdr/teddy.c
src/fdr/teddy.h
src/fdr/teddy_internal.h src/fdr/teddy_internal.h
src/fdr/teddy_runtime_common.h
src/hwlm/hwlm.c src/hwlm/hwlm.c
src/hwlm/hwlm.h src/hwlm/hwlm.h
src/hwlm/hwlm_internal.h src/hwlm/hwlm_internal.h
@ -437,6 +451,25 @@ set (hs_exec_SRCS
src/nfa/mpv.h src/nfa/mpv.h
src/nfa/mpv.c src/nfa/mpv.c
src/nfa/mpv_internal.h src/nfa/mpv_internal.h
src/nfa/multiaccel_common.h
src/nfa/multiaccel_doubleshift.h
src/nfa/multiaccel_doubleshiftgrab.h
src/nfa/multiaccel_long.h
src/nfa/multiaccel_longgrab.h
src/nfa/multiaccel_shift.h
src/nfa/multiaccel_shiftgrab.h
src/nfa/multishufti.c
src/nfa/multishufti_avx2.h
src/nfa/multishufti_sse.h
src/nfa/multishufti.h
src/nfa/multitruffle.c
src/nfa/multitruffle_avx2.h
src/nfa/multitruffle_sse.h
src/nfa/multitruffle.h
src/nfa/multivermicelli.c
src/nfa/multivermicelli.h
src/nfa/multivermicelli_sse.h
src/nfa/multivermicelli_avx2.h
src/nfa/nfa_api.h src/nfa/nfa_api.h
src/nfa/nfa_api_dispatch.c src/nfa/nfa_api_dispatch.c
src/nfa/nfa_internal.h src/nfa/nfa_internal.h
@ -444,20 +477,17 @@ set (hs_exec_SRCS
src/nfa/repeat.c src/nfa/repeat.c
src/nfa/repeat.h src/nfa/repeat.h
src/nfa/repeat_internal.h src/nfa/repeat_internal.h
src/nfa/shufti_common.h
src/nfa/shufti.c src/nfa/shufti.c
src/nfa/shufti.h src/nfa/shufti.h
src/nfa/truffle_common.h
src/nfa/truffle.c src/nfa/truffle.c
src/nfa/truffle.h src/nfa/truffle.h
src/nfa/vermicelli.h src/nfa/vermicelli.h
src/nfa/vermicelli_run.h src/nfa/vermicelli_run.h
src/nfa/vermicelli_sse.h src/nfa/vermicelli_sse.h
src/sidecar/sidecar.c
src/sidecar/sidecar.h
src/sidecar/sidecar_generic.h
src/sidecar/sidecar_internal.h
src/sidecar/sidecar_shufti.c
src/sidecar/sidecar_shufti.h
src/som/som.h src/som/som.h
src/som/som_operation.h
src/som/som_runtime.h src/som/som_runtime.h
src/som/som_runtime.c src/som/som_runtime.c
src/som/som_stream.c src/som/som_stream.c
@ -473,10 +503,11 @@ set (hs_exec_SRCS
src/rose/match.h src/rose/match.h
src/rose/match.c src/rose/match.c
src/rose/miracle.h src/rose/miracle.h
src/rose/program_runtime.h
src/rose/runtime.h src/rose/runtime.h
src/rose/rose_sidecar_runtime.h
src/rose/rose.h src/rose/rose.h
src/rose/rose_internal.h src/rose/rose_internal.h
src/rose/rose_program.h
src/rose/rose_types.h src/rose/rose_types.h
src/rose/rose_common.h src/rose/rose_common.h
src/util/bitutils.h src/util/bitutils.h
@ -484,7 +515,6 @@ set (hs_exec_SRCS
src/util/fatbit.h src/util/fatbit.h
src/util/fatbit.c src/util/fatbit.c
src/util/join.h src/util/join.h
src/util/masked_move.c
src/util/masked_move.h src/util/masked_move.h
src/util/multibit.h src/util/multibit.h
src/util/multibit_internal.h src/util/multibit_internal.h
@ -498,6 +528,7 @@ set (hs_exec_SRCS
src/util/shuffle_ssse3.h src/util/shuffle_ssse3.h
src/util/simd_utils.h src/util/simd_utils.h
src/util/simd_utils_ssse3.h src/util/simd_utils_ssse3.h
src/util/simd_utils_ssse3.c
src/util/state_compress.h src/util/state_compress.h
src/util/state_compress.c src/util/state_compress.c
src/util/unaligned.h src/util/unaligned.h
@ -510,6 +541,14 @@ set (hs_exec_SRCS
src/database.h src/database.h
) )
if (HAVE_AVX2)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/fdr/teddy_avx2.c
src/util/masked_move.c
)
endif ()
SET (hs_SRCS SET (hs_SRCS
${hs_HEADERS} ${hs_HEADERS}
@ -574,6 +613,8 @@ SET (hs_SRCS
src/nfa/mcclellan_internal.h src/nfa/mcclellan_internal.h
src/nfa/mcclellancompile.cpp src/nfa/mcclellancompile.cpp
src/nfa/mcclellancompile.h src/nfa/mcclellancompile.h
src/nfa/mcclellancompile_accel.cpp
src/nfa/mcclellancompile_accel.h
src/nfa/mcclellancompile_util.cpp src/nfa/mcclellancompile_util.cpp
src/nfa/mcclellancompile_util.h src/nfa/mcclellancompile_util.h
src/nfa/limex_compile.cpp src/nfa/limex_compile.cpp
@ -583,6 +624,8 @@ SET (hs_SRCS
src/nfa/mpv_internal.h src/nfa/mpv_internal.h
src/nfa/mpvcompile.cpp src/nfa/mpvcompile.cpp
src/nfa/mpvcompile.h src/nfa/mpvcompile.h
src/nfa/multiaccel_compilehelper.cpp
src/nfa/multiaccel_compilehelper.h
src/nfa/nfa_api.h src/nfa/nfa_api.h
src/nfa/nfa_api_queue.h src/nfa/nfa_api_queue.h
src/nfa/nfa_api_util.h src/nfa/nfa_api_util.h
@ -762,8 +805,6 @@ SET (hs_SRCS
src/parser/unsupported.h src/parser/unsupported.h
src/parser/utf8_validate.h src/parser/utf8_validate.h
src/parser/utf8_validate.cpp src/parser/utf8_validate.cpp
src/sidecar/sidecar_compile.cpp
src/sidecar/sidecar_compile.h
src/smallwrite/smallwrite_build.cpp src/smallwrite/smallwrite_build.cpp
src/smallwrite/smallwrite_build.h src/smallwrite/smallwrite_build.h
src/smallwrite/smallwrite_internal.h src/smallwrite/smallwrite_internal.h
@ -771,6 +812,7 @@ SET (hs_SRCS
src/som/slot_manager.h src/som/slot_manager.h
src/som/slot_manager_internal.h src/som/slot_manager_internal.h
src/som/som.h src/som/som.h
src/som/som_operation.h
src/rose/rose_build.h src/rose/rose_build.h
src/rose/rose_build_add.cpp src/rose/rose_build_add.cpp
src/rose/rose_build_add_internal.h src/rose/rose_build_add_internal.h
@ -778,6 +820,8 @@ SET (hs_SRCS
src/rose/rose_build_anchored.cpp src/rose/rose_build_anchored.cpp
src/rose/rose_build_anchored.h src/rose/rose_build_anchored.h
src/rose/rose_build_bytecode.cpp src/rose/rose_build_bytecode.cpp
src/rose/rose_build_castle.h
src/rose/rose_build_castle.cpp
src/rose/rose_build_compile.cpp src/rose/rose_build_compile.cpp
src/rose/rose_build_convert.cpp src/rose/rose_build_convert.cpp
src/rose/rose_build_convert.h src/rose/rose_build_convert.h
@ -786,6 +830,8 @@ SET (hs_SRCS
src/rose/rose_build_infix.h src/rose/rose_build_infix.h
src/rose/rose_build_lookaround.cpp src/rose/rose_build_lookaround.cpp
src/rose/rose_build_lookaround.h src/rose/rose_build_lookaround.h
src/rose/rose_build_matchers.cpp
src/rose/rose_build_matchers.h
src/rose/rose_build_merge.cpp src/rose/rose_build_merge.cpp
src/rose/rose_build_merge.h src/rose/rose_build_merge.h
src/rose/rose_build_misc.cpp src/rose/rose_build_misc.cpp
@ -799,6 +845,7 @@ SET (hs_SRCS
src/rose/rose_in_graph.h src/rose/rose_in_graph.h
src/rose/rose_in_util.cpp src/rose/rose_in_util.cpp
src/rose/rose_in_util.h src/rose/rose_in_util.h
src/util/accel_scheme.h
src/util/alloc.cpp src/util/alloc.cpp
src/util/alloc.h src/util/alloc.h
src/util/bitfield.h src/util/bitfield.h
@ -820,7 +867,6 @@ SET (hs_SRCS
src/util/dump_mask.cpp src/util/dump_mask.cpp
src/util/dump_mask.h src/util/dump_mask.h
src/util/graph.h src/util/graph.h
src/util/internal_report.h
src/util/multibit_build.cpp src/util/multibit_build.cpp
src/util/multibit_build.h src/util/multibit_build.h
src/util/order_check.h src/util/order_check.h
@ -828,7 +874,6 @@ SET (hs_SRCS
src/util/partitioned_set.h src/util/partitioned_set.h
src/util/popcount.h src/util/popcount.h
src/util/queue_index_factory.h src/util/queue_index_factory.h
src/util/report.cpp
src/util/report.h src/util/report.h
src/util/report_manager.cpp src/util/report_manager.cpp
src/util/report_manager.h src/util/report_manager.h
@ -874,8 +919,6 @@ set(hs_dump_SRCS
src/parser/dump.cpp src/parser/dump.cpp
src/parser/dump.h src/parser/dump.h
src/parser/position_dump.h src/parser/position_dump.h
src/sidecar/sidecar_dump.cpp
src/sidecar/sidecar_dump.h
src/smallwrite/smallwrite_dump.cpp src/smallwrite/smallwrite_dump.cpp
src/smallwrite/smallwrite_dump.h src/smallwrite/smallwrite_dump.h
src/som/slot_manager_dump.cpp src/som/slot_manager_dump.cpp
@ -901,11 +944,9 @@ set (LIB_VERSION ${HS_VERSION})
set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}) set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
add_library(hs_exec OBJECT ${hs_exec_SRCS}) add_library(hs_exec OBJECT ${hs_exec_SRCS})
add_dependencies(hs_exec ${fdr_autogen_targets})
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
add_dependencies(hs_exec_shared ${fdr_autogen_targets})
set_target_properties(hs_exec_shared PROPERTIES set_target_properties(hs_exec_shared PROPERTIES
POSITION_INDEPENDENT_CODE TRUE) POSITION_INDEPENDENT_CODE TRUE)
endif() endif()
@ -929,14 +970,16 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
OUTPUT_NAME hs_runtime OUTPUT_NAME hs_runtime
MACOSX_RPATH ON MACOSX_RPATH ON
LINKER_LANGUAGE C) LINKER_LANGUAGE C)
install(TARGETS hs_runtime_shared DESTINATION lib) install(TARGETS hs_runtime_shared
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)
endif() endif()
# we want the static lib for testing # we want the static lib for testing
add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>) add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
add_dependencies(hs ragel_Parser) add_dependencies(hs ragel_Parser)
add_dependencies(hs autogen_compiler autogen_teddy_compiler)
if (NOT BUILD_SHARED_LIBS) if (NOT BUILD_SHARED_LIBS)
install(TARGETS hs DESTINATION lib) install(TARGETS hs DESTINATION lib)
@ -945,13 +988,15 @@ endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>) add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
add_dependencies(hs_shared ragel_Parser) add_dependencies(hs_shared ragel_Parser)
add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
set_target_properties(hs_shared PROPERTIES set_target_properties(hs_shared PROPERTIES
OUTPUT_NAME hs OUTPUT_NAME hs
VERSION ${LIB_VERSION} VERSION ${LIB_VERSION}
SOVERSION ${LIB_SOVERSION} SOVERSION ${LIB_SOVERSION}
MACOSX_RPATH ON) MACOSX_RPATH ON)
install(TARGETS hs_shared DESTINATION lib) install(TARGETS hs_shared
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)
endif() endif()
if(NOT WIN32) if(NOT WIN32)

42
cmake/arch.cmake Normal file
View File

@ -0,0 +1,42 @@
# detect architecture features
#
# must be called after determining where compiler intrinsics are defined
if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
else ()
message (FATAL_ERROR "No intrinsics header found")
endif ()
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = _mm_set1_epi8(1);
(void)_mm_shuffle_epi8(a, a);
}" HAVE_SSSE3)
if (NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX2__)
#error no avx2
#endif
int main(){
__m256i z = _mm256_setzero_si256();
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)
if (NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
unset (CMAKE_REQUIRED_FLAGS)
unset (INTRIN_INC_H)

View File

@ -15,9 +15,6 @@
/* internal build, switch on dump support. */ /* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT #cmakedefine DUMP_SUPPORT
/* Build tools with threading support */
#cmakedefine ENABLE_TOOLS_THREADS
/* Define to 1 if `backtrace' works. */ /* Define to 1 if `backtrace' works. */
#cmakedefine HAVE_BACKTRACE #cmakedefine HAVE_BACKTRACE
@ -39,10 +36,6 @@
/* C compiler has intrin.h */ /* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H #cmakedefine HAVE_C_INTRIN_H
/* Define to 1 if you have the declaration of `pthread_barrier_init', and to 0
if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_BARRIER_INIT
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */ 0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
@ -59,9 +52,6 @@
/* Define to 1 if `posix_memalign' works. */ /* Define to 1 if `posix_memalign' works. */
#cmakedefine HAVE_POSIX_MEMALIGN #cmakedefine HAVE_POSIX_MEMALIGN
/* Define to 1 if you have the <pthread.h> header file. */
#cmakedefine HAVE_PTHREAD_H
/* Define to 1 if you have the `setrlimit' function. */ /* Define to 1 if you have the `setrlimit' function. */
#cmakedefine HAVE_SETRLIMIT #cmakedefine HAVE_SETRLIMIT

View File

@ -119,12 +119,21 @@ The following regex constructs are supported by Hyperscan:
* The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and * The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and
:regexp:`\\z`. :regexp:`\\z`.
* Option modifiers for: * Option modifiers:
* Case-sensitivity: :regexp:`(?i)` and :regexp:`(?-i)` These allow behaviour to be switched on (with :regexp:`(?<option>)`) and off
* Multi-line: :regexp:`(?m)` and :regexp:`(?-m)` (with :regexp:`(?-<option>)`) for a sub-pattern. The supported options are:
* Dot-all: :regexp:`(?s)` and :regexp:`(?-s)`
* Extended syntax: :regexp:`(?s)` and :regexp:`(?-s)` * :regexp:`i`: Case-insensitive matching, as per
:c:member:`HS_FLAG_CASELESS`.
* :regexp:`m`: Multi-line matching, as per :c:member:`HS_FLAG_MULTILINE`.
* :regexp:`s`: Interpret ``.`` as "any character", as per
:c:member:`HS_FLAG_DOTALL`.
* :regexp:`x`: Extended syntax, which will ignore most whitespace in the
pattern for compatibility with libpcre's ``PCRE_EXTENDED`` option.
For example, the expression :regexp:`foo(?i)bar(?-i)baz` will switch on
case-insensitive matching *only* for the ``bar`` portion of the match.
* The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and * The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and
'not word boundary', respectively). 'not word boundary', respectively).
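As a hedged illustration of the option modifiers described above, the sketch below compiles the example expression ``foo(?i)bar(?-i)baz`` in block mode; the helper function name is invented for illustration and error handling is abbreviated::

#include <stdio.h>
#include <hs/hs.h>

/* Sketch: the inline (?i)/(?-i) modifiers make only "bar" caseless, whereas
 * passing HS_FLAG_CASELESS to hs_compile() would apply to the whole pattern. */
static hs_database_t *compile_example(void) {
    hs_database_t *db = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_compile("foo(?i)bar(?-i)baz", 0 /* no global flags */,
                   HS_MODE_BLOCK, NULL, &db, &compile_err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return NULL;
    }
    return db;
}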

View File

@ -44,7 +44,7 @@ master_doc = 'index'
# General information about the project. # General information about the project.
project = u'Hyperscan' project = u'Hyperscan'
copyright = u'2015, Intel Corporation' copyright = u'2015-2016, Intel Corporation'
# The version info for the project you're documenting, acts as replacement for # The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the # |version| and |release|, also used in various other places throughout the

View File

@ -30,4 +30,4 @@ and/or other countries.
\*Other names and brands may be claimed as the property of others. \*Other names and brands may be claimed as the property of others.
Copyright |copy| 2015, Intel Corporation. All rights reserved. Copyright |copy| 2015-2016, Intel Corporation. All rights reserved.

View File

@ -15,6 +15,7 @@ Hyperscan |version| Developer's Reference Guide
getting_started getting_started
compilation compilation
runtime runtime
serialization
performance performance
api_constants api_constants
api_files api_files

View File

@ -124,13 +124,19 @@ databases, only a single scratch region is necessary: in this case, calling
will ensure that the scratch space is large enough to support scanning against will ensure that the scratch space is large enough to support scanning against
any of the given databases. any of the given databases.
Importantly, only one such space is required per thread and can (and indeed While the Hyperscan library is re-entrant, the use of scratch spaces is not.
should) be allocated before data scanning is to commence. In a scenario where a For example, if by design it is deemed necessary to run recursive or nested
set of expressions are compiled by a single "master" thread and data will be scanning (say, from the match callback function), then an additional scratch
scanned by multiple "worker" threads, the convenience function space is required for that context.
:c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space
to be made for each thread (rather than forcing the caller to pass all the In the absence of recursive scanning, only one such space is required per thread
compiled databases through :c:func:`hs_alloc_scratch` multiple times). and can (and indeed should) be allocated before data scanning is to commence.
In a scenario where a set of expressions are compiled by a single "master"
thread and data will be scanned by multiple "worker" threads, the convenience
function :c:func:`hs_clone_scratch` allows multiple copies of an existing
scratch space to be made for each thread (rather than forcing the caller to pass
all the compiled databases through :c:func:`hs_alloc_scratch` multiple times).
For example: For example:
@ -163,14 +169,6 @@ For example:
/* Now two threads can both scan against database db, /* Now two threads can both scan against database db,
each with its own scratch space. */ each with its own scratch space. */
While the Hyperscan library is re-entrant, the use of scratch spaces is not.
For example, if by design it is deemed necessary to run recursive or nested
scanning (say, from the match callback function), then an additional scratch
space is required for that context.
The easiest way to achieve this is to build up a single scratch space as a
prototype, then clone it for each context:
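A minimal sketch of that prototype-and-clone approach (assuming ``db`` is an existing compiled database; error handling abbreviated)::

#include <hs/hs.h>

/* Sketch: build one scratch space as a prototype, then clone it for each
 * additional context (e.g. per worker thread). *prototype and *per_thread
 * must be NULL on entry; error handling is abbreviated. */
static hs_error_t make_thread_scratch(const hs_database_t *db,
                                      hs_scratch_t **prototype,
                                      hs_scratch_t **per_thread) {
    hs_error_t err = hs_alloc_scratch(db, prototype);
    if (err != HS_SUCCESS) {
        return err;
    }
    /* Each additional context gets its own clone of the prototype. */
    return hs_clone_scratch(*prototype, per_thread);
}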
***************** *****************
Custom Allocators Custom Allocators
***************** *****************

View File

@ -0,0 +1,67 @@
.. _serialization:
#############
Serialization
#############
For some applications, compiling Hyperscan pattern databases immediately prior
to use is not an appropriate design. Some users may wish to:
* Compile pattern databases on a different host;
* Persist compiled databases to storage and only re-compile pattern databases
when the patterns change;
* Control the region of memory in which the compiled database is located.
Hyperscan pattern databases are not completely flat in memory: they contain
pointers and have specific alignment requirements. Therefore, they cannot be
copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
provides functionality for serializing and deserializing compiled pattern
databases.
The API provides the following functions:
#. :c:func:`hs_serialize_database`: serializes a pattern database into a
flat relocatable buffer of bytes.
#. :c:func:`hs_deserialize_database`: reconstructs a newly allocated pattern
database from the output of :c:func:`hs_serialize_database`.
#. :c:func:`hs_deserialize_database_at`: reconstructs a pattern
database at a given memory location from the output of
:c:func:`hs_serialize_database`.
#. :c:func:`hs_serialized_database_size`: given a serialized pattern database,
returns the size of the memory block required by the database when
deserialized.
#. :c:func:`hs_serialized_database_info`: given a serialized pattern database,
returns a string containing information about the database. This call is
analogous to :c:func:`hs_database_info`.
.. note:: Hyperscan performs both version and platform compatibility checks
upon deserialization. The :c:func:`hs_deserialize_database` and
:c:func:`hs_deserialize_database_at` functions will only permit the
deserialization of databases compiled with (a) the same version of Hyperscan
and (b) platform features supported by the current host platform. See
:ref:`instr_specialization` for more information on platform specialization.
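A hedged sketch of the serialize/deserialize round trip described above (assuming ``db`` is a compiled database; error handling abbreviated, and the serialized buffer is assumed to come from the default ``malloc``-based misc allocator)::

#include <stdlib.h>
#include <hs/hs.h>

/* Sketch: serialize a compiled database to a flat buffer, then rebuild a
 * new database from those bytes. Error handling is abbreviated. */
static hs_database_t *round_trip(const hs_database_t *db) {
    char *bytes = NULL;
    size_t length = 0;
    if (hs_serialize_database(db, &bytes, &length) != HS_SUCCESS) {
        return NULL;
    }
    /* (bytes, length) could now be written to disk or sent to another host. */

    hs_database_t *rebuilt = NULL;
    if (hs_deserialize_database(bytes, length, &rebuilt) != HS_SUCCESS) {
        rebuilt = NULL;
    }
    free(bytes); /* serialized buffer uses the misc allocator (malloc by default) */
    return rebuilt;
}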
===================
The Runtime Library
===================
The main Hyperscan library (``libhs``) contains both the compiler and runtime
portions of the library. Because the Hyperscan compiler is written in C++, this
library requires C++ linkage and has a dependency on the C++ standard library.
Many embedded applications require only the scanning ("runtime") portion of the
Hyperscan library. In these cases, pattern compilation generally takes place on
another host, and serialized pattern databases are delivered to the application
for use.
To support these applications without requiring the C++ dependency, a
runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
distributed. This library does not depend on the C++ standard library and
provides all Hyperscan functions other than those used to compile databases.
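As a hedged sketch of such a runtime-only consumer (the header split shown and the linking note are assumptions; error handling abbreviated)::

#include <stddef.h>
#include <stdio.h>
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>

/* Sketch of a runtime-only consumer: rebuild a received database and scan a
 * block of data with it. Only runtime-side functions are used, so linking
 * against libhs_runtime should suffice; error handling is abbreviated. */
static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    (void)from; (void)flags; (void)ctx;
    printf("match for pattern %u ending at offset %llu\n", id, to);
    return 0; /* continue matching */
}

static void scan_serialized(const char *bytes, size_t length,
                            const char *data, unsigned int data_len) {
    hs_database_t *db = NULL;
    hs_scratch_t *scratch = NULL;
    if (hs_deserialize_database(bytes, length, &db) != HS_SUCCESS) {
        return;
    }
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return;
    }
    hs_scan(db, data, data_len, 0, scratch, on_match, NULL);
    hs_free_scratch(scratch);
    hs_free_database(db);
}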

View File

@ -7,4 +7,5 @@ Name: libhs
Description: Intel(R) Hyperscan Library Description: Intel(R) Hyperscan Library
Version: @HS_VERSION@ Version: @HS_VERSION@
Libs: -L${libdir} -lhs Libs: -L${libdir} -lhs
Libs.private: @PRIVATE_LIBS@
Cflags: -I${includedir}/hs Cflags: -I${includedir}/hs

View File

@ -1,39 +0,0 @@
# The set of rules and other nastiness for generating FDR/Teddy source
# we need to add these as explicit dependencies
set(AUTOGEN_PY_FILES
arch.py
autogen.py
autogen_utils.py
base_autogen.py
fdr_autogen.py
teddy_autogen.py
)
function(fdr_autogen type out)
add_custom_command (
COMMENT "AUTOGEN ${out}"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}"
COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}"
DEPENDS ${AUTOGEN_PY_FILES}
)
add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}")
endfunction(fdr_autogen)
#now build the functions
fdr_autogen(runtime fdr_autogen.c)
fdr_autogen(compiler fdr_autogen_compiler.cpp)
fdr_autogen(teddy_runtime teddy_autogen.c)
fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
set(fdr_GENERATED_SRC
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
PARENT_SCOPE)
set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
include_directories(${CMAKE_CURRENT_BINARY_DIR})

View File

@ -1,58 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import autogen_utils
# wrapper for architectures
class Arch:
def __init__(self, name, extensions = []):
self.name = name
self.extensions = extensions
self.target = None
def get_guard(self):
# these defines definitely fall into the "belt-and-suspenders"
# category of paranoia
if (self.guard_list == []):
return "#if 1"
return "#if " + " && ".join(self.guard_list)
class X86Arch(Arch):
def __init__(self, name, extensions = []):
Arch.__init__(self, name, extensions)
self.guard_list = [ ]
self.target = "0"
if "AVX2" in extensions:
self.target += " | HS_CPU_FEATURES_AVX2"
self.guard_list += [ "defined(__AVX2__)" ]
arch_x86_64 = X86Arch("x86_64", extensions = [ ])
arch_x86_64_avx2 = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])

View File

@ -1,154 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from fdr_autogen import *
from teddy_autogen import *
from arch import *
# FDR setup
# these are either produced - if the guard succeeds, or #defined to zeroes.
# either the function or the zero is fine in our array of function pointers
def produce_fdr_runtimes(l):
for m in l:
m.produce_code()
def produce_fdr_compiles(l):
print "void getFdrDescriptions(vector<FDREngineDescription> *out) {"
print " static const FDREngineDef defns[] = {"
for m in l:
m.produce_compile_call()
print " };"
print " out->clear();"
print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
print " out->push_back(FDREngineDescription(defns[i]));"
print " }"
print "}"
def build_fdr_matchers():
all_matchers = [ ]
strides = [ 1, 2, 4 ]
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
for s in strides:
all_matchers += [ M3(stride = s, **common) ]
return all_matchers
# teddy setup
def build_teddy_matchers():
all_matchers = [ ]
# AVX2
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
for n_msk in range(1, 5):
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
# SSE/SSE2/SSSE3
for n_msk in range(1, 5):
all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
return all_matchers
def produce_teddy_compiles(l):
print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
print " static const TeddyEngineDef defns[] = {"
for m in l:
m.produce_compile_call()
print " };"
print " out->clear();"
print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
print " out->push_back(TeddyEngineDescription(defns[i]));"
print " }"
print "}"
# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
# are linked. So we either generate the function or we don't - then at the point of the
# header in fdr_autogen.c we either generate the header or we #define the zero.
def produce_teddy_runtimes(l):
# Since we're using -Wmissing-prototypes, we need headers first.
for m in l:
m.produce_guard()
print m.produce_header(visible = True, header_only = True)
m.close_guard()
for m in l:
m.produce_guard()
m.produce_code()
m.close_guard()
# see produce_teddy_runtimes() comment for the rationale
def produce_teddy_headers(l):
for m in l:
m.produce_guard()
print m.produce_header(visible = True, header_only = True)
m.produce_zero_alternative()
# general utilities
def make_fdr_function_pointers(matcher_list):
print """
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
static FDRFUNCTYPE funcs[] = {
"""
all_funcs = ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
print all_funcs
print """
};
"""
def assign_ids(matcher_list, next_id):
for m in matcher_list:
m.id = next_id
next_id += 1
return next_id
# Main entry point
m = build_fdr_matchers()
next_id = assign_ids(m, 0)
tm = build_teddy_matchers()
next_id = assign_ids(tm, next_id)
if sys.argv[1] == "compiler":
produce_fdr_compiles(m)
elif sys.argv[1] == "runtime":
produce_fdr_runtimes(m)
produce_teddy_headers(tm)
make_fdr_function_pointers(m+tm)
elif sys.argv[1] == "teddy_runtime":
produce_teddy_runtimes(tm)
elif sys.argv[1] == "teddy_compiler":
produce_teddy_compiles(tm)

View File

@ -1,285 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
def fail_out(msg = ""):
print >>sys.stderr, "Internal failure in autogen.py: " + msg
sys.exit(1)
class IntegerType:
def __init__(self, size):
self.size = size
def get_name(self):
return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
def size_in_bytes(self):
return self.size / 8
def isSIMDOnIntel(self):
return False
def zero_expression(self):
return "0"
def constant_to_string(self, n):
if self.size == 64:
suffix = "ULL"
else:
suffix = ""
return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
def lowbits(self, n):
return (1 << n) - 1
def highbits(self, n):
return ~(self.lowbits(self.size - n))
def lowbit_mask(self, n):
return self.constant_to_string(self.lowbits(n))
def highbit_mask(self, n):
return self.constant_to_string(self.highbits(n))
def lowbit_extract_expr(self, expr_string, n):
return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
def highbit_extract_expr(self, expr_string, n):
return "(%s >> %d)" % (expr_string, self.size - n)
def flip_lowbits_expr(self, expr_string, n):
return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
def bit_extract_expr(self, expr_string, low, high):
lbm = self.lowbit_mask(high - low)
return "((%s >> %d) & %s)" % (expr_string, low, lbm)
# shifts are +ve if left and -ve if right
def shift_expr(self, expr_string, n):
if n <= -self.size or n >= self.size:
return self.zero_expression()
elif (n > 0):
return "(%s << %d)" % (expr_string, n)
elif (n < 0):
return "(%s >> %d)" % (expr_string, -n)
else:
return "(%s)" % (expr_string)
# code is:
# "normal" (always between buf and len) - the default
# "aligned" (means normal + aligned to a natural boundary)
# "cautious_forward" (means may go off the end of buf+len)
# "cautious_backwards" (means may go off the start of buf)
# "cautious_everywhere" (means may go off both)
def load_expr_data(self, offset = 0, code = "normal",
base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"):
if code is "normal":
return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "aligned":
if self.size is 8:
fail_out("no aligned byte loads")
return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_forward":
return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_backward":
return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
elif code is "cautious_everywhere":
return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
class SIMDIntegerType(IntegerType):
def __init__(self, size):
IntegerType.__init__(self, size)
def isSIMDOnIntel(self):
return True
def zero_expression(self):
return "zeroes128()"
def lowbit_extract_expr(self, expr_string, n):
if (n <= 32):
tmpType = IntegerType(32)
tmpExpr = "movd(%s)" % expr_string
elif (32 < n <= 64):
tmpType = IntegerType(64)
tmpExpr = "movq(%s)" % expr_string
return tmpType.lowbit_extract_expr(tmpExpr, n)
def highbit_extract_expr(self, expr_string, n):
fail_out("Unimplemented high bit extract on m128")
def bit_extract_expr(self, expr_string, low, high, flip):
fail_out("Unimplemented bit extract on m128")
def shift_expr(self, expr_string, n):
if n % 8 != 0:
fail_out("Trying to shift a m128 by a bit granular value")
# should check that n is divisible by 8
if n <= -self.size or n >= self.size:
return self.zero_expression()
elif (n > 0):
return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8)
elif (n < 0):
return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8)
else:
return "(%s)" % (expr_string)
def lowbit_mask(self, n):
if n % 8 != 0:
fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
return self.shift_expr("ones128()", -(128 - n))
def getRequiredType(bits):
if bits == 128:
return SIMDIntegerType(bits)
for b in [ 8, 16, 32, 64]:
if (bits <= b):
return IntegerType(b)
return None
class IntegerVariable:
def __init__(self, name, type):
self.name = name
self.type = type
def gen_initializer_stmt(self, initialization_string = None):
if initialization_string:
return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string)
else:
return "%s %s;" % (self.type.get_name(), self.name)
class Step:
def __init__(self, context, offset = 0):
self.context = context
self.matcher = context.matcher
self.offset = offset
self.latency = 1
self.dependency_list = []
self.latest = None
self.context.add_step(self)
# return a string, complete with indentation
def emit(self):
indent = " " * (self.offset*2 + self.matcher.default_body_indent)
s = "\n".join( [ indent + line for line in self.val.split("\n")] )
if self.latest:
s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest
if self.dependency_list:
s += " Derps: "
for (d,l) in self.dependency_list:
s += "%d/%d " % (d.debug_step,l)
return s
def add_dependency(self, step, anti_dependency = False, output_dependency = False):
if anti_dependency or output_dependency:
self.dependency_list += [ (step, 1) ]
else:
self.dependency_list += [ (step, step.latency) ]
def nv(self, type, var_name):
return self.context.new_var(self, type, var_name)
def gv(self, var_name, reader = True, writer = False):
return self.context.get_var(self, var_name, reader = reader, writer = writer)
# utility steps, generic
class LabelStep(Step):
def __init__(self, context, offset = 0, label_prefix = "off"):
Step.__init__(self, context, offset)
self.val = "%s%d: UNUSED;" % (label_prefix, offset)
class OpenScopeStep(Step):
def __init__(self, context, offset = 0):
Step.__init__(self, context, offset)
self.val = "{"
class CloseScopeStep(Step):
def __init__(self, context, offset = 0):
Step.__init__(self, context, offset)
self.val = "}"
class CodeGenContext:
def __init__(self, matcher):
self.vars = {}
self.steps = []
self.ctr = 0
self.matcher = matcher
self.var_writer = {} # var to a single writer
self.var_readers = {} # var to a list of all the readers that read the last value
def new_var(self, step, type, var_name):
var = IntegerVariable(var_name, type)
self.vars[var_name] = var
self.var_writer[var_name] = step
return var
def get_var(self, step, var_name, reader = True, writer = False):
if reader:
writer_step = self.var_writer[var_name]
if writer_step:
step.add_dependency(writer_step)
self.var_readers.setdefault(var_name, []).append(step)
if writer and not reader:
if self.var_writer[var_name]:
step.add_dependency(self.var_writer[var_name], output_dependency = True)
if writer:
if self.var_readers.has_key(var_name):
for reader in [ r for r in self.var_readers[var_name] if r is not step ]:
step.add_dependency(reader, anti_dependency = True)
self.var_readers[var_name] = []
self.var_writer[var_name] = step
return self.vars[var_name]
def add_step(self, step):
self.steps += [ step ]
step.debug_step = self.ctr
self.ctr += 1
def dontschedule(self, finals):
return "\n".join( [ s.emit() for s in self.steps ] )
def schedule(self, finals):
for f in finals:
f.latest = f.latency
worklist = finals
while worklist:
current = worklist[0]
worklist = worklist[1:]
for (dep, lat) in current.dependency_list:
if dep.latest is None or dep.latest < (current.latest + dep.latency):
dep.latest = current.latest + lat
if dep not in worklist:
worklist += [ dep ]
self.steps.sort(reverse = True, key = lambda s : s.latest)
return "\n".join( [ s.emit() for s in self.steps ] )

View File

@ -1,167 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class MatcherBase:
def __init__(self):
pass
def get_name(self):
return "fdr_exec_%03d" % self.id
def produce_header(self, visible, header_only = False):
s = ""
if not visible:
s += "static never_inline"
s += """
hwlm_error_t %s(UNUSED const struct FDR *fdr,
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
if header_only:
s += ";"
else:
s += "{"
s += "\n"
return s
def produce_guard(self):
print self.arch.get_guard()
def produce_zero_alternative(self):
print """
#else
#define %s 0
#endif
""" % self.get_name()
# trivial function for documentation/modularity
def close_guard(self):
print "#endif"
def produce_common_declarations(self):
return """
const u8 * buf = a->buf;
const size_t len = a->len;
const u8 * ptr = buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t * control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 * tryFloodDetect = a->firstFloodDetect;
UNUSED u32 bit, bitRem, confSplit, idx;
u32 byte, cf;
const struct FDRConfirm *fdrc;
u32 last_match = (u32)-1;
"""
def produce_continue_check(self):
return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
"""
def produce_flood_check(self):
return """
if (P0(ptr > tryFloodDetect)) {
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}
"""
def produce_footer(self):
return """
*a->groups = controlVal;
return HWLM_SUCCESS;
}
"""
def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
if cautious:
caution_string = "VECTORING"
else:
caution_string = "NOT_CAUTIOUS"
conf_split_mask = IntegerType(32).constant_to_string(
self.conf_top_level_split - 1)
if enable_confirmless:
quick_check_string = """
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((last_match == id) && (fdrc->flags & NoRepeat))
continue;
last_match = id;
controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
continue;
} """
else:
quick_check_string = ""
if do_bailout:
bailout_string = """
if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
else:
bailout_string = ""
return Template("""
if (P0(!!$CONFVAR)) {
do {
bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
byte = bit / $NUM_BUCKETS + $OFFSET;
bitRem = bit % $NUM_BUCKETS;
$BAILOUT_STRING
confSplit = *(ptr+byte) & $SPLIT_MASK;
idx = confSplit * $NUM_BUCKETS + bitRem;
cf = confBase[idx];
if (!cf)
continue;
fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
if (!(fdrc->groups & *control))
continue;
$QUICK_CHECK_STRING
confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match);
} while(P0(!!$CONFVAR));
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}""").substitute(CONFVAR = conf_var_name,
CONFVAR_SIZE = conf_var_size,
NUM_BUCKETS = self.num_buckets,
OFFSET = offset,
SPLIT_MASK = conf_split_mask,
QUICK_CHECK_STRING = quick_check_string,
BAILOUT_STRING = bailout_string,
CAUTION_STRING = caution_string,
CONF_PULL_BACK = self.conf_pull_back)
def indent(block, depth):
return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] )

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -55,6 +55,7 @@ public:
u32 getNumBuckets() const { return numBuckets; } u32 getNumBuckets() const { return numBuckets; }
u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; } u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; } u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; }
bool isValidOnTarget(const target_t &target_in) const; bool isValidOnTarget(const target_t &target_in) const;
virtual u32 getDefaultFloodSuffixLength() const = 0; virtual u32 getDefaultFloodSuffixLength() const = 0;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,34 +26,790 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "util/simd_utils.h"
#define P0(cnd) unlikely(cnd)
#include "fdr.h" #include "fdr.h"
#include "fdr_internal.h"
#include "teddy_internal.h"
#include "flood_runtime.h"
#include "fdr_confirm.h" #include "fdr_confirm.h"
#include "fdr_confirm_runtime.h" #include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h" #include "fdr_internal.h"
#include "fdr_loadval.h" #include "fdr_loadval.h"
#include "fdr_autogen.c" #include "fdr_streaming_runtime.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
/** \brief number of bytes processed in each iteration */
#define ITER_BYTES 16
/** \brief total zone buffer size */
#define ZONE_TOTAL_SIZE 64
/** \brief maximum number of allowed zones */
#define ZONE_MAX 3
/** \brief zone information.
*
* Zone represents a region of data to scan in FDR.
*
* The incoming buffer is split into multiple zones to ensure two properties:
* 1: that we can read 8? bytes behind to generate a hash safely
* 2: that we can read the byte after the current byte (domain > 8)
*/
struct zone {
/** \brief copied buffer, used only when it is a boundary zone. */
u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
/** \brief shift amount for fdr state to avoid unwanted match. */
u8 shift;
/** \brief if boundary zone, start points into the zone buffer after the
* pre-padding. Otherwise, points to the main buffer, appropriately. */
const u8 *start;
/** \brief if boundary zone, end points to the end of zone. Otherwise,
* pointer to the main buffer, appropriately. */
const u8 *end;
/** \brief the amount to adjust to go from a pointer in the zones region
* (between start and end) to a pointer in the original data buffer. */
ptrdiff_t zone_pointer_adjust;
/** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
* otherwise end of the zone buf. floodPtr always points inside the same
* buffer as the start pointer. */
const u8 *floodPtr;
};
static
const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};
/* Generates an initial state mask based on the last byte or so of history
* rather than being all accepting. If there is no history to consider, the
* state is generated based on the minimum length of each bucket, in order to
* prevent spurious confirms.
*/
static really_inline
m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
const struct zone *z) {
m128 s;
if (len_history) {
/* +1: the zones ensure that we can read the byte at z->end */
u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
tmp &= fdr->domainMask;
s = *((const m128 *)ft + tmp);
s = shiftRight8Bits(s);
} else {
s = fdr->start;
}
return s;
}
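/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the 16-bit load above straddles the last byte of history
 * (or previously scanned data) and the first byte the zone will report from,
 * so the initial state comes from the reach-table entry for that
 * boundary-spanning value rather than the all-accepting fdr->start. A scalar
 * model of the index construction, assuming a little-endian host as the load
 * helpers do: */
static really_inline
u32 boundary_domain_index(u8 prev_byte, u8 next_byte, u32 domainMask) {
    u32 v = (u32)prev_byte | ((u32)next_byte << 8); /* lv_u16 equivalent */
    return v & domainMask;                          /* m128 index into ft */
}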
static really_inline
void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st1 = *(const m128 *)(ft + v1*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st3 = *(const m128 *)(ft + v3*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st5 = *(const m128 *)(ft + v5*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st7 = *(const m128 *)(ft + v7*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st9 = *(const m128 *)(ft + v9*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st11 = *(const m128 *)(ft + v11*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st13 = *(const m128 *)(ft + v13*8);
m128 st14 = *(const m128 *)(ft + v14*8);
m128 st15 = *(const m128 *)(ft + v15*8);
st1 = byteShiftLeft128(st1, 1);
st2 = byteShiftLeft128(st2, 2);
st3 = byteShiftLeft128(st3, 3);
st4 = byteShiftLeft128(st4, 4);
st5 = byteShiftLeft128(st5, 5);
st6 = byteShiftLeft128(st6, 6);
st7 = byteShiftLeft128(st7, 7);
st9 = byteShiftLeft128(st9, 1);
st10 = byteShiftLeft128(st10, 2);
st11 = byteShiftLeft128(st11, 3);
st12 = byteShiftLeft128(st12, 4);
st13 = byteShiftLeft128(st13, 5);
st14 = byteShiftLeft128(st14, 6);
st15 = byteShiftLeft128(st15, 7);
*s = or128(*s, st0);
*s = or128(*s, st1);
*s = or128(*s, st2);
*s = or128(*s, st3);
*s = or128(*s, st4);
*s = or128(*s, st5);
*s = or128(*s, st6);
*s = or128(*s, st7);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st9);
*s = or128(*s, st10);
*s = or128(*s, st11);
*s = or128(*s, st12);
*s = or128(*s, st13);
*s = or128(*s, st14);
*s = or128(*s, st15);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
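/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the load/shift/mask sequence above is equivalent to
 * taking, for each byte position i of the iteration, the low `domain` bits of
 * a little-endian 16-bit load at itPtr + i, pre-doubled so that multiplying
 * by 8 indexes the 16-byte reach-table rows directly. */
static really_inline
u64a scalar_domain_index(const u8 *itPtr, u32 i, u64a domain_mask_adjusted) {
    u64a v = (u64a)itPtr[i] | ((u64a)itPtr[i + 1] << 8); /* u16 at itPtr + i */
    return (v << 1) & domain_mask_adjusted;              /* same as v0..v15 */
}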
static really_inline
void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st14 = *(const m128 *)(ft + v14*8);
st2 = byteShiftLeft128(st2, 2);
st4 = byteShiftLeft128(st4, 4);
st6 = byteShiftLeft128(st6, 6);
st10 = byteShiftLeft128(st10, 2);
st12 = byteShiftLeft128(st12, 4);
st14 = byteShiftLeft128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
*s = or128(*s, st4);
*s = or128(*s, st6);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st10);
*s = or128(*s, st12);
*s = or128(*s, st14);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st12 = *(const m128 *)(ft + v12*8);
st4 = byteShiftLeft128(st4, 4);
st12 = byteShiftLeft128(st12, 4);
*s = or128(*s, st0);
*s = or128(*s, st4);
*conf0 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*conf8 = movq(*s);
*s = byteShiftRight128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal,
const u32 *confBase, const struct FDR_Runtime_Args *a,
const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id,
struct zone *z) {
const u8 bucket = 8;
const u8 pullback = 1;
if (likely(!*conf)) {
return;
}
/* ptr is currently referring to a location in the zone's buffer, we also
* need a pointer in the original, main buffer for the final string compare.
*/
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
const u8 *confLoc = ptr;
do {
u32 bit = findAndClearLSB_64(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr + byte);
u32 idx = confSplit * bucket + bitRem;
u32 cf = confBase[idx];
if (!cf) {
continue;
}
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) {
continue;
}
*last_match_id = id;
*controlVal = a->cb(ptr_main + byte - a->buf,
ptr_main + byte - a->buf, id, a->ctxt);
continue;
}
u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a));
confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback,
control, last_match_id, confVal);
} while (unlikely(!!*conf));
}
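/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the loop above visits the set bits of a 64-bit confirm
 * word via findAndClearLSB_64(); bit / 8 selects the byte position within
 * this 8-byte half of the iteration and bit % 8 selects the bucket. A
 * standalone model using a compiler builtin in place of that helper: */
static really_inline
void for_each_conf_bit(u64a conf) {
    while (conf) {
        u32 bit = (u32)__builtin_ctzll(conf); /* index of lowest set bit */
        conf &= conf - 1;                     /* clear it */
        u32 byte = bit / 8;                   /* byte position in this half */
        u32 bucket = bit % 8;                 /* bucket within that byte */
        (void)byte;
        (void)bucket;
    }
}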
static really_inline
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
#ifdef DEBUG
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
z->start, z->end, z->shift);
DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
z->zone_pointer_adjust, z->floodPtr);
DEBUG_PRINTF("zone buf:");
for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
if (i % 8 == 0) {
printf("_");
}
if (z->buf[i]) {
printf("%02x", z->buf[i]);
} else {
printf("..");
}
}
printf("\n");
#endif
}
/**
* \brief Updates attributes for non-boundary region zone.
*/
static really_inline
void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
struct zone *z) {
z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
z->start = begin;
z->end = end;
z->floodPtr = flood;
z->shift = 0;
}
/**
* \brief Create zone for short cases (<= ITER_BYTES).
*
* For this case we need to copy everything into the zone's internal buffer.
*
* We need to ensure that we run over real data if it exists (in history or
* before zone begin). We also need to ensure 8 bytes before any data being
* matched can be read (to perform a conf hash).
*
* We also need to ensure that the data at z->end can be read.
*
* Hence, the zone consists of:
* 16 bytes of history,
 * 1 - 24 bytes of data from the buffer (ending at end),
* 1 byte of final padding
*/
static really_inline
void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
const u8 *end, struct zone *z) {
/* the floodPtr for boundary zones is set to the end of the zone buf so that
* the flood-detection checks are skipped inside boundary zones. */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
ptrdiff_t z_len = end - begin;
assert(z_len > 0);
assert(z_len <= ITER_BYTES);
z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
/* we are guaranteed to always have 16 initialised bytes at the end of
* the history buffer (they may be garbage coming from the stream state
* preceding hbuf, but bytes that don't correspond to actual history
* shouldn't affect computations). */
*(m128 *)z->buf = loadu128(hend - sizeof(m128));
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
switch (copy_len) {
case 1:
*zone_data = *(end - 1);
break;
case 2:
*(u16 *)zone_data = unaligned_load_u16(end - 2);
break;
case 3:
*(u16 *)zone_data = unaligned_load_u16(end - 3);
*(zone_data + 2) = *(end - 1);
break;
case 4:
*(u32 *)zone_data = unaligned_load_u32(end - 4);
break;
case 5:
case 6:
case 7:
/* perform copy with 2 overlapping 4-byte chunks from buf. */
*(u32 *)zone_data = unaligned_load_u32(end - copy_len);
unaligned_store_u32(zone_data + copy_len - sizeof(u32),
unaligned_load_u32(end - sizeof(u32)));
break;
case 8:
*(u64a *)zone_data = unaligned_load_u64a(end - 8);
break;
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
/* perform copy with 2 overlapping 8-byte chunks from buf. */
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
unaligned_load_u64a(end - sizeof(u64a)));
break;
case 16:
/* copy 16-bytes from buf. */
*(m128 *)zone_data = loadu128(end - 16);
break;
default:
assert(copy_len <= sizeof(m128) + sizeof(u64a));
/* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
*/
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
storeu128(zone_data + copy_len - sizeof(m128),
loadu128(end - sizeof(m128)));
break;
}
/* set the start and end location of the zone buf
* to be scanned */
u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
*z_end = 0;
z->end = z_end;
z->start = z_end - ITER_BYTES;
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
assert(z->start + z->shift == z_end - z_len);
}
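/* Editor's illustrative sketch (not part of the original source; helper name
 * is hypothetical): the 5-7 and 9-15 byte cases above use the
 * overlapping-chunk copy idiom, shown standalone here for 9-15 bytes. The two
 * 8-byte moves may overlap in the middle but together cover exactly `len`
 * bytes, avoiding a per-byte loop. */
static really_inline
void copy_overlapping_u64a(u8 *dst, const u8 *src_end, size_t len) {
    assert(len > sizeof(u64a) && len < 2 * sizeof(u64a));
    unaligned_store_u64a(dst, unaligned_load_u64a(src_end - len));
    unaligned_store_u64a(dst + len - sizeof(u64a),
                         unaligned_load_u64a(src_end - sizeof(u64a)));
}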
/**
* \brief Create a zone for the start region.
*
* This function requires that there is > ITER_BYTES of data in the buffer to
* scan. The start zone itself is always responsible for scanning exactly
* ITER_BYTES of data - there are no warmup/junk bytes scanned.
*
* This zone ensures that the byte at z->end can be read and corresponds to
* the next byte of data.
*
* 8 bytes of history data are provided before z->start to allow proper hash
 * generation in streaming mode. If buf != begin, up to 8 bytes of data
 * prior to begin are also provided.
*
* Although we are not interested in bare literals which start before begin
* if buf != begin, lookarounds associated with the literal may require
* the data prior to begin for hash purposes.
*/
static really_inline
void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
struct zone *z) {
assert(ITER_BYTES == sizeof(m128));
assert(sizeof(CONF_TYPE) == 8);
static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
const u8 *end = begin + ITER_BYTES;
/* set floodPtr to the end of zone buf to avoid checks in start zone */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
z->shift = 0; /* we are processing ITER_BYTES of real data */
/* we are guaranteed to always have 16 initialised bytes at the end of the
* history buffer (they may be garbage coming from the stream state
* preceding hbuf, but bytes that don't correspond to actual history
* shouldn't affect computations). However, for start zones, history is only
* required for conf hash purposes so we only need 8 bytes */
unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len >= 16);
/* copy the post-padding byte; this is required for domain > 8 due to
 * overhang. The start zone requires that there is data after the zone, so it
 * is safe to dereference end. */
z->buf[ZONE_START_BEGIN + copy_len] = *end;
/* set the start and end location of the zone buf to be scanned */
u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
z->end = z_end;
z->start = z_end - ITER_BYTES;
/* copy the first 8 bytes of the valid region */
unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
unaligned_load_u64a(end - copy_len));
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
/**
* \brief Create a zone for the end region.
*
* This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The end zone, however, is only responsible for scanning the <=
* ITER_BYTES rump of data. The end zone is required to handle a full ITER_BYTES
* iteration as the main loop cannot handle the last byte of the buffer.
*
* This zone ensures that the byte at z->end can be read by filling it with a
* padding character.
*
 * Up to 8 bytes of data prior to begin are also provided for the purposes of
 * generating hashes. History is not copied, as all locations which require
 * history for generating a hash are the responsibility of the start zone.
*/
static really_inline
void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
struct zone *z) {
/* the floodPtr for boundary zones is set to the end of the zone buf so that
* the flood-detection checks are skipped inside boundary zones. */
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
ptrdiff_t z_len = end - begin;
assert(z_len > 0);
assert(z_len <= ITER_BYTES);
z->shift = ITER_BYTES - z_len;
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len >= 16);
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
z->buf[copy_len] = 0;
/* set the start and end location of the zone buf
* to be scanned */
u8 *z_end = z->buf + copy_len;
z->end = z_end;
z->start = z_end - ITER_BYTES;
assert(z->start + z->shift == z_end - z_len);
/* copy the first 8 bytes of the valid region */
unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len));
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
/**
* \brief Prepare zones.
*
 * This function prepares zones over the actual buffer plus some padding bytes.
 * The actual ITER_BYTES of data in each zone are preceded by bytes from the
 * main buf and/or history buf, and followed by padding bytes possibly taken
 * from the main buf, if available.
*/
static really_inline
size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
size_t start, const u8 *flood, struct zone *zoneArr) {
const u8 *ptr = buf + start;
size_t remaining = len - start;
if (remaining <= ITER_BYTES) {
/* enough bytes to make only one zone */
createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
return 1;
}
/* enough bytes to make more than one zone */
size_t numZone = 0;
createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
ptr += ITER_BYTES;
assert(ptr < buf + len);
/* find maximum buffer location that the main zone can scan
* - must be a multiple of ITER_BYTES, and
* - cannot contain the last byte (due to overhang)
*/
const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES);
assert(main_end >= ptr);
/* create a zone if multiple of ITER_BYTES are found */
if (main_end != ptr) {
createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
ptr = main_end;
}
/* create a zone with rest of the data from the main buffer */
createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
return numZone;
}
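/* Editor's worked example (hypothetical numbers and helper, not part of the
 * original source): with ITER_BYTES == 16, a 100-byte buffer and start == 0,
 * prepareZones() yields three zones: a start zone over [0, 16), a main zone
 * over [16, 96) since ROUNDDOWN_N(99, 16) == 96, and an end zone over
 * [96, 100) scanned with shift == 12 so that only the final 4 bytes can
 * report. The helper below models that arithmetic for the multi-zone path
 * (len - start > ITER_BYTES). */
static void example_zone_bounds(size_t len, size_t start, size_t *main_begin,
                                size_t *main_end, size_t *end_shift) {
    *main_begin = start + ITER_BYTES;                 /* after the start zone */
    *main_end = start + ((len - start - 1) & ~(size_t)(ITER_BYTES - 1));
    size_t end_begin = *main_end > *main_begin ? *main_end : *main_begin;
    *end_shift = ITER_BYTES - (len - end_begin);      /* rescanned bytes */
}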
#define INVALID_MATCH_ID (~0U)
#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
do { \
const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \
\
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
itPtr += ITER_BYTES) { \
if (unlikely(itPtr > tryFloodDetect)) { \
tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
&floodBackoff, &controlVal, \
ITER_BYTES); \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} \
__builtin_prefetch(itPtr + (ITER_BYTES*4)); \
u64a conf0; \
u64a conf8; \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \
ft, &conf0, &conf8, &s); \
do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \
control, &last_match_id, zz); \
do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \
control, &last_match_id, zz); \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} /* end for loop */ \
} while (0) \
static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u64a domain_mask_adjusted = fdr->domainMask << 1;
u8 stride = fdr->stride;
const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
const u32 *confBase = (const u32 *)(ft + fdr->tabSize);
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history,
a->start_offset, a->firstFloodDetect, zones);
assert(numZone <= ZONE_MAX);
m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
for (size_t curZone = 0; curZone < numZone; curZone++) {
struct zone *z = &zones[curZone];
dumpZoneInfo(z, curZone);
/* When a zone contains less data than is processed in an iteration
* of FDR_MAIN_LOOP(), we need to scan over some extra data.
*
* We have chosen to scan this extra data at the start of the
 * iteration. The extra data is either data we have already scanned or
 * garbage (if it is earlier than offset 0).
*
* As a result we need to shift the incoming state back so that it will
* properly line up with the data being scanned.
*
* We also need to forbid reporting any matches in the data being
* rescanned as they have already been reported (or are over garbage but
* later stages should also provide that safety guarantee).
*/
u8 shift = z->shift;
state = variable_byte_shift_m128(state, shift);
state = or128(state, load128(zone_or_mask[shift]));
switch (stride) {
case 1:
FDR_MAIN_LOOP(z, state, get_conf_stride_1);
break;
case 2:
FDR_MAIN_LOOP(z, state, get_conf_stride_2);
break;
case 4:
FDR_MAIN_LOOP(z, state, get_conf_stride_4);
break;
default:
break;
}
}
return HWLM_SUCCESS;
}
#if defined(__AVX2__)
#define ONLY_AVX2(func) func
#else
#define ONLY_AVX2(func) NULL
#endif
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
static const FDRFUNCTYPE funcs[] = {
fdr_engine_exec,
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat),
fdr_exec_teddy_msks1,
fdr_exec_teddy_msks1_pck,
fdr_exec_teddy_msks2,
fdr_exec_teddy_msks2_pck,
fdr_exec_teddy_msks3,
fdr_exec_teddy_msks3_pck,
fdr_exec_teddy_msks4,
fdr_exec_teddy_msks4_pck,
};
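/* Editor's note (assumption, not shown in this hunk; helper name is
 * hypothetical): the table above is expected to be indexed by the engine
 * identifier stored in the FDR header, roughly as sketched below.
 * fdr->engineID and the dispatch shape are assumed from elsewhere in the
 * tree rather than from this diff. */
static really_inline
hwlm_error_t fdr_dispatch(const struct FDR *fdr,
                          const struct FDR_Runtime_Args *a) {
    return funcs[fdr->engineID](fdr, a);
}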
#define FAKE_HISTORY_SIZE 16
static const u8 fake_history[FAKE_HISTORY_SIZE];

-hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start,
-                     HWLMCallback cb, void *ctxt, hwlm_group_t groups) {
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
+                     size_t start, HWLMCallback cb, void *ctxt,
+                     hwlm_group_t groups) {
+    // We guarantee (for safezone construction) that it is safe to read 16
+    // bytes before the end of the history buffer.
+    const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
+
    const struct FDR_Runtime_Args a = {
        buf,
        len,
-        fake_history,
+        hbuf,
        0,
-        fake_history, // nocase
+        hbuf, // nocase
        0,
        start,
        cb,
@ -73,7 +829,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t st
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
                              size_t hlen, const u8 *buf, size_t len,
                              size_t start, HWLMCallback cb, void *ctxt,
-                              hwlm_group_t groups, u8 * stream_state) {
+                              hwlm_group_t groups, u8 *stream_state) {
    struct FDR_Runtime_Args a = {
        buf,
        len,
@ -86,9 +842,9 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
        ctxt,
        &groups,
        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
-        hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen)
-             : (u64a)0
+        /* we are guaranteed to always have 16 initialised bytes at the end of
+         * the history buffer (they may be garbage). */
+        hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
    };
    fdrUnpackState(fdr, &a, stream_state);

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -43,9 +43,6 @@ extern "C" {
struct FDR;

-/** \brief Returns size in bytes of the given FDR engine. */
-size_t fdrSize(const struct FDR *fdr);
-
/** \brief Returns non-zero if the contents of the stream state indicate that
 * there is active FDR history beyond the regularly used history. */
u32 fdrStreamStateActive(const struct FDR *fdr, const u8 *stream_state);

View File

@ -1,564 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class OrStep(Step):
def __init__(self, context, offset, width):
Step.__init__(self, context, offset)
s_var = self.gv("st%d" % offset)
if width < 128:
self.val = "s |= %s;" % s_var.name
else:
self.val = "s = or%d(s, %s);" % (width, s_var.name)
class ShiftStateStep(Step):
def __init__(self, context, offset = 0, stride_used = 1):
Step.__init__(self, context, offset)
m = self.matcher
state = m.state_variable
shift_distance = -1 * stride_used * m.num_buckets
self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
class BulkLoadStep(Step):
def __init__(self, context, offset, size, define_var = True, aligned = True):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 4
blt = m.bulk_load_type
if aligned:
init_string = blt.load_expr_data(self.offset, code = "aligned")
else:
init_string = blt.load_expr_data(self.offset)
var_name = "current_data_%d" % offset
if define_var:
lb_var = self.nv(blt, var_name)
self.val = lb_var.gen_initializer_stmt(init_string)
else:
lb_var = self.gv(var_name, reader = False, writer = True)
self.val = "%s = %s;" % (var_name, init_string)
class ValueExtractStep(Step):
def __init__(self, context, offset, sub_load_cautious = False):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 2
dsb = m.datasize_bytes
modval = offset % dsb
if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
else:
# Case 2: reading a value that can be found entirely in the current register
if m.fdr2_force_naive_load:
load_string = m.single_load_type.load_expr_data(self.offset, "normal")
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
else:
lb_var = self.gv("current_data_%d" % (offset - modval))
if modval == 0:
# Case 2a: value is at LSB end of the register and must be left-
# shifted into place if there is a "reach_shift_adjust" required
temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust)
else:
# Case 2b: value is in the middle of the register and will be
# right-shifted into place (adjusted by "reach_shift_adjust")
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
class TableLookupStep(Step):
def __init__(self, context, reach_multiplier, offset = 0):
Step.__init__(self, context, offset)
m = self.matcher
self.latency = 4
v_var = self.gv("v%d" % offset)
s_var = self.nv(m.state_type, "st%d" % offset)
init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(),
v_var.name, reach_multiplier)
self.val = s_var.gen_initializer_stmt(init_string)
class ShiftReachMaskStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
extr = m.extract_frequency
modval = offset % extr
s_var = self.gv("st%d" % offset, writer = True)
self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets))
class ConfExtractStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
if m.state_type.isSIMDOnIntel():
self.latency = 2
init_string = m.state_type.lowbit_extract_expr("s", m.extract_size)
extr_var = self.nv(m.extr_type, "extr%d" % offset)
self.val = extr_var.gen_initializer_stmt(init_string)
class ConfAccumulateStep(Step):
def __init__(self, context, extract_offset, conf_offset, define_var = True):
Step.__init__(self, context, extract_offset)
m = self.matcher
extr_var = self.gv("extr%d" % extract_offset)
extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name)
if extract_offset == conf_offset:
# create conf_var as a straight copy of extr
if define_var:
conf_var = self.nv(m.conf_type, "conf%d" % conf_offset)
self.val = conf_var.gen_initializer_stmt(extr_var_cast)
else:
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
self.val = "%s = %s;" % (conf_var.name, extr_var_cast)
else:
# shift extr_var and insert/OR it in conf_var
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
shift_dist = (extract_offset - conf_offset) * m.num_buckets
self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist))
self.latency = 2
class ConfirmFlipStep(Step):
def __init__(self, context, offset):
Step.__init__(self, context, offset)
m = self.matcher
conf_var = self.gv("conf%d" % self.offset, writer = True)
self.val = "%s = %s;" % (conf_var.name,
conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets))
class ConfirmStep(Step):
def __init__(self, context, offset, cautious = False):
Step.__init__(self, context, offset)
m = self.matcher
conf_var = self.gv("conf%d" % offset, writer = True)
self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious,
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
def produce_compile_call(self):
print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
stride_offsets = xrange(0, self.loop_bytes, self.stride)
stride_offsetSet = set(stride_offsets)
so_steps_last_block = []
sh = None
last_confirm = None
ctxt = CodeGenContext(self)
if switch_variant:
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
print self.produce_flood_check()
print " __builtin_prefetch(ptr + (iterBytes*4));"
print " assert(((size_t)ptr % START_MOD) == 0);"
# just do globally for now
if switch_variant:
subsidiary_load_cautious = True
confirm_cautious = True
else:
subsidiary_load_cautious = False
confirm_cautious = False
if not self.fdr2_force_naive_load:
bulk_load_steps = [ off for off in range(self.loop_bytes)
if off % self.datasize_bytes == 0 and
(set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)]
else:
bulk_load_steps = []
confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ]
for off in bulk_load_steps:
lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off)
print " " + lb_var.gen_initializer_stmt()
for off in confirm_steps:
var_name = "conf%d" % off
conf_def_var = ctxt.new_var(None, self.conf_type, var_name)
if switch_variant:
init_string = "(%s)-1" % self.conf_type.get_name()
else:
init_string = ""
print " " + conf_def_var.gen_initializer_stmt(init_string)
if switch_variant:
print " switch(iterBytes - dist) {"
for i in range(0, self.loop_bytes):
print " case %d:" % i
# init and poison conf; over-precise but harmless
conf_id = (i / self.confirm_frequency) * self.confirm_frequency
if i % self.confirm_frequency:
conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency))
print " conf%d >>= %d;" % (conf_id, conf_fixup_bits)
else:
print " conf%d = 0;" % conf_id
# init state
state_fixup = i % self.extract_frequency
state = self.state_variable
shift_distance = self.num_buckets * state_fixup
if state_fixup:
print " %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
if self.state_width < 128:
print " %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance))
else:
print " %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance))
if not self.fdr2_force_naive_load:
# init current_data (could poison it in some cases)
load_mod = i % self.datasize_bytes
load_offset = i - load_mod
if load_mod:
# not coming in on an even boundary means having to do a load var
# actually, there are a bunch of things we can do on this bulk load
# to avoid having to be 'cautious_backwards' but I'm not completely
# sure they are good ideas
init_string = self.bulk_load_type.load_expr_data(load_offset,
code = "cautious_backward")
var_name = "current_data_%d" % load_offset
lb_var = ctxt.get_var(None, var_name, reader = False, writer = True)
print " %s = %s;" % (lb_var.name, init_string)
print " goto off%d;" % i
print " case %d: goto skipSwitch;" % self.loop_bytes
print " }"
print " {"
for off in range(self.loop_bytes):
# X_mod is the offset we're up to relative to the last X operation
# X_offset is which of the last X operations matches this iteration
if (switch_variant):
LabelStep(ctxt, off)
if off in bulk_load_steps:
if not self.fdr2_force_naive_load:
BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant)
if off in stride_offsets:
if switch_variant:
OpenScopeStep(ctxt, off)
ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious)
TableLookupStep(ctxt, self.reach_mult, off)
if off % self.extract_frequency:
ShiftReachMaskStep(ctxt, off)
so = OrStep(ctxt, off, self.state_width)
if switch_variant:
CloseScopeStep(ctxt, off)
if sh != None:
so.add_dependency(sh)
so_steps_last_block += [ so ]
extract_mod = off % self.extract_frequency
extract_offset = off - extract_mod
extract_ready = extract_mod == self.extract_frequency - 1
if extract_ready:
if switch_variant:
OpenScopeStep(ctxt, off)
ex = ConfExtractStep(ctxt, extract_offset)
ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False)
for so_step in so_steps_last_block:
ex.add_dependency(so_step)
if switch_variant:
CloseScopeStep(ctxt, off)
so_steps_last_block = []
sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency)
sh.add_dependency(ex)
confirm_mod = off % self.confirm_frequency
confirm_offset = off - confirm_mod
confirm_ready = confirm_mod == self.confirm_frequency - 1
if confirm_ready:
cflip = ConfirmFlipStep(ctxt, confirm_offset)
cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious )
if last_confirm:
cf.add_dependency(last_confirm)
last_confirm = cf
if not switch_variant:
print ctxt.schedule([ last_confirm, sh ])
else:
print ctxt.dontschedule([ last_confirm, sh ])
if switch_variant:
print "skipSwitch:;"
print " ptr += iterBytes;"
print " }" # close extra scope around switch variant
print " }"
def produce_init_state(self):
state = self.state_variable
s_type = self.state_type
shift_distance = -1 * self.num_buckets
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
$TYPENAME s;
if (a->len_history) {
u32 tmp = 0;
if (a->start_offset == 0) {
tmp = a->buf_history[a->len_history - 1];
tmp |= (a->buf[0] << 8);
} else {
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
}
tmp &= fdr->domainMask;
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
loop_read_behind = 0
loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
# we have a choice as to whether to mask out the value early or
# extract the value (shift first) then mask it
#
# Intel has a free scaling factor from 1/2/4/8 so we want to combine
# the extra needed shift for SSE registers with the mask operation
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
self.reach_mult = 8
else:
self.reach_mult = ssb
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
print self.produce_header(visible = False)
print "// ",
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
print " assert(fdr->domain > 8 && fdr->domain < 16);"
print
print " u64a domain_mask = fdr->domainMask;"
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
print " const size_t iterBytes = %d;" % self.loop_bytes
print " const size_t START_MOD = %d;" % self.datasize_bytes
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
u8 doMainLoop = 1;
size_t remaining = len - (ptr - buf);
size_t dist;
if (remaining <= iterBytes) {
dist = remaining; // once through the switch and we're done
} else if (remaining < 2 * iterBytes) {
// nibble some stuff off the front, skip the main loop,
// then come back here
dist = iterBytes; // maybe could be cleverer
} else {
// now, we need to see if we can make it to a main loop iteration
// if so, we need to ensure that the main loop iteration is aligned
// to a START_MOD boundary and i >= 8 so we can read ptr + i - 8
// see if we can do it - if not, just switch the main loop off,
// eat iterBytes in cautious mode, and come back to this loop
const u8 * target = MAX(buf + 8, ptr);
target = ROUNDUP_PTR(target, START_MOD);
dist = target - ptr;
if (dist > iterBytes) {
doMainLoop = 0;
dist = iterBytes;
}
}
"""
self.produce_main_loop(switch_variant = True)
self.produce_main_loop(switch_variant = False)
print """
}
"""
print self.produce_footer()
def get_name(self):
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
extract_frequency = None,
confirm_frequency = None):
# First - set up the values that are fundamental to how this matcher will operate
self.arch = arch
# get the width of the state width on which we operate internally
if state_width not in [ 128 ]:
fail_out("Unknown state width: %d" % state_width)
self.state_width = state_width
self.state_type = getRequiredType(self.state_width)
self.state_variable = IntegerVariable("s", self.state_type)
table_state_width = state_width
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
# this is the load type required for domain [9:15] if we want to
# load it one at a time
self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table
if stride not in [ 1, 2, 4, 8]:
fail_out("Unsupported stride: %d" % stride)
if stride * num_buckets > state_width:
fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width))
self.stride = stride
if num_buckets != 8:
fail_out("Unsupported number of buckets: %d" % num_buckets)
if state_width % num_buckets and state_width == 128:
fail_out("Bucket scheme requires bit-shifts on m128 (failing)")
self.num_buckets = num_buckets
# Second - set up derived or optimization values - these can be
# overridden by arguments that are passed in
self.datasize = 64
self.bulk_load_type = IntegerType(self.datasize)
self.datasize_bytes = self.datasize/8
self.value_extract_type = IntegerType(self.datasize)
self.fdr2_force_naive_load = False # disable everywhere for trunk
# extract frequency is how frequently (in bytes) we destructively shift
# our state value after having pulled out that many bytes into a
# confirm register (of one sort or another).
# none means a default value - datasize, our biggest easily available GPR
if extract_frequency is None:
extract_frequency = self.datasize_bytes
self.extract_frequency = extract_frequency
self.extract_size = self.extract_frequency*self.num_buckets
if extract_frequency < stride:
fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride))
if extract_frequency not in [ None, 1, 2, 4, 8, 16]:
fail_out("Weird extract frequency: %d" % extract_frequency)
if self.extract_size <= 32:
self.extr_type = IntegerType(32)
elif self.extract_size <= 64:
self.extr_type = IntegerType(64)
else:
fail_out("Implausible size %d required for confirm extract step" % size)
# extract_frequency is how often we pull out our state and place
# it somewhere in a lossless fashion
# confirm_frequency, on the other hand, is how frequently we
# take the state extracted by extract_frequency and cobble it
# together into a matching loop
# confirm_frequency must be a multiple of extract_frequency
# and must fit into a fast register; for now; we're going to
# stay in the GPR domain
if confirm_frequency is None:
confirm_frequency = self.extract_frequency
self.confirm_frequency = confirm_frequency
if confirm_frequency % self.extract_frequency:
fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency))
self.conf_size = self.confirm_frequency * self.num_buckets
if self.conf_size <= 32:
self.conf_type = IntegerType(32)
elif self.conf_size <= 64:
self.conf_type = IntegerType(64)
else:
fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size)
# how many bytes in flight at once
self.loop_bytes = 16
# confirm configuration
# how many entries in the top-level confirm table - 256 means
# complete split on the last character
self.conf_top_level_split = 256
# how much we 'pull back' in confirm - this is obviously related
# to the first level conf but we will keep two separate paramters
# for this to avoid the risk of conflating these
self.conf_pull_back = 1
if self.conf_pull_back > 0 and self.conf_top_level_split < 256:
fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split))
# minor stuff
self.default_body_indent = 8

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,7 +29,7 @@
/** \file
 * \brief FDR literal matcher: build API.
 */

-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile.h"
#include "fdr_confirm.h"
@ -187,9 +187,9 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
    /* we are allowing domains 9 to 15 only */
    assert(eng.bits > 8 && eng.bits < 16);
    fdr->domain = eng.bits;
-    fdr->schemeWidthByte = eng.schemeWidth / 8;
    fdr->domainMask = (1 << eng.bits) - 1;
-    fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+    fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8);
+    fdr->stride = eng.stride;

    if (link.first) {
        fdr->link = verify_u32(ptr - fdr_base);
@ -544,6 +544,7 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
    // temporary hack for unit testing
    if (hint != HINT_INVALID) {
        des->bits = 9;
+        des->stride = 1;
    }

    FDRCompiler fc(lits, *des, make_small);
@ -571,10 +572,9 @@ fdrBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small, u32 hint,
#endif

-} // namespace ue2
-// FIXME: should be compile-time only
size_t fdrSize(const FDR *fdr) {
    assert(fdr);
    return fdr->size;
}
+} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -61,6 +61,9 @@ fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
#endif

+/** \brief Returns size in bytes of the given FDR engine. */
+size_t fdrSize(const struct FDR *fdr);
+
} // namespace ue2

#endif

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -36,144 +36,121 @@
#include "util/bitutils.h"
#include "util/compare.h"

-#define CONF_LOADVAL_CALL lv_u64a
-#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce
-
// this is ordinary confirmation function which runs through
// the whole confirmation procedure
static really_inline
-void confWithBit(const struct FDRConfirm * fdrc,
-                 const struct FDR_Runtime_Args * a,
-                 size_t i,
-                 CautionReason r,
-                 u32 pullBackAmount,
-                 hwlmcb_rv_t *control,
-                 u32 * last_match) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    const u8 * buf = a->buf;
-    const size_t len = a->len;
-    CONF_TYPE v;
-    const u8 * confirm_loc = buf + i - pullBackAmount - 7;
-    if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) {
-        v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len);
-    } else { // r == VECTORING, confirm_loc < buf
-        u64a histBytes = a->histBytes;
-        v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len);
-        // stitch together v (which doesn't move) and history (which does)
-        u32 overhang = buf - confirm_loc;
-        histBytes >>= 64 - (overhang * 8);
-        v |= histBytes;
-    }
-    u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID);
-    u32 start = getConfirmLitIndex(fdrc)[c];
-    if (P0(start)) {
-        const struct LitInfo *l =
-            (const struct LitInfo *)((const u8 *)fdrc + start);
-        u8 oldNext; // initialized in loop
-        do {
-            assert(ISALIGNED(l));
-            if (P0( (v & l->msk) != l->v)) {
-                goto out;
-            }
-            if ((*last_match == l->id) && (l->flags & NoRepeat)) {
-                goto out;
-            }
-            const u8 * loc = buf + i - l->size + 1 - pullBackAmount;
-            u8 caseless = l->flags & Caseless;
-            if (loc < buf) {
-                u32 full_overhang = buf - loc;
-                const u8 * history = (caseless) ?
-                    a->buf_history_nocase : a->buf_history;
-                size_t len_history = (caseless) ?
-                    a->len_history_nocase : a->len_history;
-                // can't do a vectored confirm either if we don't have
-                // the bytes
-                if (full_overhang > len_history) {
-                    goto out;
-                }
-                // as for the regular case, no need to do a full confirm if
-                // we're a short literal
-                if (unlikely(l->size > sizeof(CONF_TYPE))) {
-                    const u8 * s1 = l->s;
-                    const u8 * s2 = s1 + full_overhang;
-                    const u8 * loc1 = history + len_history - full_overhang;
-                    const u8 * loc2 = buf;
-                    size_t size1 = MIN(full_overhang,
-                                       l->size - sizeof(CONF_TYPE));
-                    size_t wind_size2_back = sizeof(CONF_TYPE) +
-                                             full_overhang;
-                    size_t size2 = wind_size2_back > l->size ?
-                                   0 : l->size - wind_size2_back;
-                    if (cmpForward(loc1, s1, size1, caseless)) {
-                        goto out;
-                    }
-                    if (cmpForward(loc2, s2, size2, caseless)) {
-                        goto out;
-                    }
-                }
-            } else { // NON-VECTORING PATH
-                // if string < conf_type we don't need regular string cmp
-                if (unlikely(l->size > sizeof(CONF_TYPE))) {
-                    if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) {
-                        goto out;
-                    }
-                }
-            }
-            if (P0(!(l->groups & *control))) {
-                goto out;
-            }
-            if (unlikely(l->flags & ComplexConfirm)) {
-                const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount;
-                if (loc2 < buf) {
-                    u32 full_overhang = buf - loc2;
-                    size_t len_history = (caseless) ?
-                        a->len_history_nocase : a->len_history;
-                    if (full_overhang > len_history) {
-                        goto out;
-                    }
-                }
-            }
-            *last_match = l->id;
-            *control = a->cb(loc - buf, i, l->id, a->ctxt);
-        out:
-            oldNext = l->next; // oldNext is either 0 or an 'adjust' value
-            l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size);
-        } while (oldNext);
-    }
-}
+void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a,
+                 size_t i, u32 pullBackAmount, hwlmcb_rv_t *control,
+                 u32 *last_match, u64a conf_key) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    const u8 * buf = a->buf;
+    u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
+                           fdrc->nBitsOrSoleID);
+    u32 start = getConfirmLitIndex(fdrc)[c];
+    if (likely(!start)) {
+        return;
+    }
+
+    const struct LitInfo *li
+        = (const struct LitInfo *)((const u8 *)fdrc + start);
+
+    u8 oldNext; // initialized in loop
+    do {
+        assert(ISALIGNED(li));
+
+        if (unlikely((conf_key & li->msk) != li->v)) {
+            goto out;
+        }
+        if ((*last_match == li->id) && (li->flags & NoRepeat)) {
+            goto out;
+        }
+
+        const u8 *loc = buf + i - li->size + 1 - pullBackAmount;
+        u8 caseless = li->flags & Caseless;
+
+        if (loc < buf) {
+            u32 full_overhang = buf - loc;
+
+            const u8 *history = caseless ? a->buf_history_nocase
+                                         : a->buf_history;
+            size_t len_history = caseless ? a->len_history_nocase
+                                          : a->len_history;
+
+            // can't do a vectored confirm either if we don't have
+            // the bytes
+            if (full_overhang > len_history) {
+                goto out;
+            }
+
+            // as for the regular case, no need to do a full confirm if
+            // we're a short literal
+            if (unlikely(li->size > sizeof(CONF_TYPE))) {
+                const u8 *s1 = li->s;
+                const u8 *s2 = s1 + full_overhang;
+                const u8 *loc1 = history + len_history - full_overhang;
+                const u8 *loc2 = buf;
+                size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE));
+                size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang;
+                size_t size2 = wind_size2_back > li->size ?
+                             0 : li->size - wind_size2_back;
+
+                if (cmpForward(loc1, s1, size1, caseless)) {
+                    goto out;
+                }
+                if (cmpForward(loc2, s2, size2, caseless)) {
+                    goto out;
+                }
+            }
+        } else { // NON-VECTORING PATH
+            // if string < conf_type we don't need regular string cmp
+            if (unlikely(li->size > sizeof(CONF_TYPE))) {
+                if (cmpForward(loc, li->s, li->size - sizeof(CONF_TYPE),
+                               caseless)) {
+                    goto out;
+                }
+            }
+        }
+
+        if (unlikely(!(li->groups & *control))) {
+            goto out;
+        }
+
+        if (unlikely(li->flags & ComplexConfirm)) {
+            const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount;
+            if (loc2 < buf) {
+                u32 full_overhang = buf - loc2;
+                size_t len_history = caseless ? a->len_history_nocase
+                                              : a->len_history;
+                if (full_overhang > len_history) {
+                    goto out;
+                }
+            }
+        }
+
+        *last_match = li->id;
+        *control = a->cb(loc - buf, i, li->id, a->ctxt);
+out:
+        oldNext = li->next; // oldNext is either 0 or an 'adjust' value
+        li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size);
+    } while (oldNext);
+}
// 'light-weight' confirmation function which is used by 1-mask Teddy;
// in the 'confirmless' case it simply calls callback function,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
-void confWithBit1(const struct FDRConfirm * fdrc,
-                  const struct FDR_Runtime_Args * a,
-                  size_t i,
-                  CautionReason r,
-                  hwlmcb_rv_t *control,
-                  u32 * last_match) {
+void confWithBit1(const struct FDRConfirm *fdrc,
+                  const struct FDR_Runtime_Args *a, size_t i,
+                  hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
    assert(i < a->len);
    assert(ISALIGNED(fdrc));

    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
        return;
    } else {
        u32 id = fdrc->nBitsOrSoleID;
@ -190,12 +167,9 @@ void confWithBit1(const struct FDRConfirm * fdrc,
// In the 'confirmless' case it makes fast 32-bit comparison,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
-void confWithBitMany(const struct FDRConfirm * fdrc,
-                     const struct FDR_Runtime_Args * a,
-                     size_t i,
-                     CautionReason r,
-                     hwlmcb_rv_t *control,
-                     u32 * last_match) {
+void confWithBitMany(const struct FDRConfirm *fdrc,
+                     const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
+                     hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
    assert(i < a->len);
    assert(ISALIGNED(fdrc));
@ -204,7 +178,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
    }

    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
        return;
    } else {
        const u32 id = fdrc->nBitsOrSoleID;
@ -215,7 +189,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
        }

        if (r == VECTORING && len > i - a->start_offset) {
-            if (len > (i + a->len_history)) {
+            if (len > i + a->len_history) {
                return;
            }

View File

@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -28,11 +28,11 @@
#include "config.h"

-#include "fdr.h"
-#include "fdr_internal.h"
+#include "fdr_compile.h"
#include "fdr_compile_internal.h"
#include "fdr_dump.h"
#include "fdr_engine_description.h"
+#include "fdr_internal.h"
#include "teddy_engine_description.h"
#include "ue2common.h"
@ -68,8 +68,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
    }

    if (isTeddy) {
-        unique_ptr<TeddyEngineDescription> des =
-            getTeddyDescription(fdr->engineID);
+        auto des = getTeddyDescription(fdr->engineID);
        if (des) {
            fprintf(f, " masks %u\n", des->numMasks);
            fprintf(f, " buckets %u\n", des->getNumBuckets());
@ -78,16 +77,8 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
            fprintf(f, " <unknown engine>\n");
        }
    } else {
-        unique_ptr<FDREngineDescription> des =
-            getFdrDescription(fdr->engineID);
-        if (des) {
-            fprintf(f, " domain %u\n", des->bits);
-            fprintf(f, " stride %u\n", des->stride);
-            fprintf(f, " buckets %u\n", des->getNumBuckets());
-            fprintf(f, " width %u\n", des->schemeWidth);
-        } else {
-            fprintf(f, " <unknown engine>\n");
-        }
+        fprintf(f, " domain %u\n", fdr->domain);
+        fprintf(f, " stride %u\n", fdr->stride);
    }

    fprintf(f, " strings ???\n");


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,13 +42,11 @@ using namespace std;
namespace ue2 { namespace ue2 {
#include "fdr_autogen_compiler.cpp"
FDREngineDescription::FDREngineDescription(const FDREngineDef &def) FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features), : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance, def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit), def.confirmTopLevelSplit),
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {} schemeWidth(def.schemeWidth), stride(0), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const { u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5! // rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@ -56,6 +54,12 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1; return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1;
} }
void getFdrDescriptions(vector<FDREngineDescription> *out) {
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
out->clear();
out->emplace_back(def);
}
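Illustrative reading only (not part of the patch): with the stride field dropped from FDREngineDef, the aggregate above maps onto the remaining fields as shown in the fdr_engine_description.h change below. The comments in this sketch are inferred from that header, not code from the source:

    static const FDREngineDef def = {
        0,   /* id */
        128, /* schemeWidth */
        8,   /* numBuckets */
        0,   /* cpu_features */
        1,   /* confirmPullBackDistance */
        256, /* confirmTopLevelSplit */
    };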
static static
u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) { u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
u32 desiredStride = 1; // always our safe fallback u32 desiredStride = 1; // always our safe fallback
@ -108,32 +112,33 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
FDREngineDescription *best = nullptr; FDREngineDescription *best = nullptr;
u32 best_score = 0; u32 best_score = 0;
FDREngineDescription &eng = allDescs[0];
for (u32 domain = 9; domain <= 15; domain++) { for (u32 domain = 9; domain <= 15; domain++) {
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { for (size_t stride = 1; stride <= 4; stride *= 2) {
// to make sure that domains >=14 have stride 1 according to origin // to make sure that domains >=14 have stride 1 according to origin
if (domain > 13 && engineID > 0) { if (domain > 13 && stride > 1) {
continue; continue;
} }
FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) { if (!eng.isValidOnTarget(target)) {
continue; continue;
} }
if (msl < eng.stride) { if (msl < stride) {
continue; continue;
} }
u32 score = 100; u32 score = 100;
score -= absdiff(desiredStride, eng.stride); score -= absdiff(desiredStride, stride);
if (eng.stride <= desiredStride) { if (stride <= desiredStride) {
score += eng.stride; score += stride;
} }
u32 effLits = vl.size(); /* * desiredStride;*/ u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal; u32 ideal;
if (effLits < eng.getNumBuckets()) { if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) { if (stride == 1) {
ideal = 8; ideal = 8;
} else { } else {
ideal = 10; ideal = 10;
@ -158,27 +163,28 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
ideal -= 2; ideal -= 2;
} }
if (eng.stride > 1) { if (stride > 1) {
ideal++; ideal++;
} }
DEBUG_PRINTF("effLits %u\n", effLits); DEBUG_PRINTF("effLits %u\n", effLits);
if (target.is_atom_class() && !make_small && effLits < 4000) { if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables /* Unless it is a very heavy case, we want to build smaller
* on lightweight machines due to their small caches. */ * tables on lightweight machines due to their small caches. */
ideal -= 2; ideal -= 2;
} }
score -= absdiff(ideal, domain); score -= absdiff(ideal, domain);
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " DEBUG_PRINTF("fdr %u: width=%u, domain=%u, buckets=%u, stride=%zu "
"-> score=%u\n", "-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits, eng.getID(), eng.schemeWidth, domain,
eng.getNumBuckets(), eng.stride, score); eng.getNumBuckets(), stride, score);
if (!best || score > best_score) { if (!best || score > best_score) {
eng.bits = domain; eng.bits = domain;
eng.stride = stride;
best = &eng; best = &eng;
best_score = score; best_score = score;
} }


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,6 @@ struct FDREngineDef {
u32 id; u32 id;
u32 schemeWidth; u32 schemeWidth;
u32 numBuckets; u32 numBuckets;
u32 stride;
u64a cpu_features; u64a cpu_features;
u32 confirmPullBackDistance; u32 confirmPullBackDistance;
u32 confirmTopLevelSplit; u32 confirmTopLevelSplit;
@ -73,7 +72,6 @@ chooseEngine(const target_t &target, const std::vector<hwlmLiteral> &vl,
bool make_small); bool make_small);
std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID); std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID);
void getFdrDescriptions(std::vector<FDREngineDescription> *out); void getFdrDescriptions(std::vector<FDREngineDescription> *out);
} // namespace ue2 } // namespace ue2
#endif #endif


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -76,17 +76,17 @@ struct FDR {
* structures (spillover strings and hash table) if we're a secondary * structures (spillover strings and hash table) if we're a secondary
* structure. */ * structure. */
u32 link; u32 link;
u8 domain; /* dynamic domain info */
u8 schemeWidthByte; /* scheme width in bytes */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
union {
    u32 s_u32;
    u64a s_u64a;
    m128 s_m128;
} start;
u8 stride; /* stride - how frequently the data is consulted by the first
            * stage matcher */
u8 domain; /* number of bits used to index into the main FDR table. This value
            * is used only for debugging/asserts. */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad;
m128 start; /* initial start state to use at offset 0. The state has been set
             * up based on the min length of buckets to reduce the need for
             * pointless confirms. */
}; };
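Illustrative sketch only (not from the patch): the pre-computed domainMask is the natural companion of the domain bit count; a hypothetical helper showing the usual relationship:

    static inline u16 make_domain_mask(u8 domain_bits) {
        /* hypothetical helper, not in the source; e.g. domain 13 -> 0x1fff */
        return (u16)((1u << domain_bits) - 1);
    }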
/** \brief FDR runtime arguments. /** \brief FDR runtime arguments.


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -29,188 +29,43 @@
#ifndef FDR_LOADVAL_H #ifndef FDR_LOADVAL_H
#define FDR_LOADVAL_H #define FDR_LOADVAL_H
#include "fdr_internal.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/unaligned.h" #include "util/unaligned.h"
#include "util/simd_utils.h"
#define MAKE_LOADVAL(type, name) \
    static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi)

#define NORMAL_SAFE(type) assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi)
#define ALIGNED_SAFE(type) NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0);
// these ones need asserts to test the property that we're not handling dynamically
#define CAUTIOUS_FORWARD_SAFE(type) assert(ptr >= lo)
#define CAUTIOUS_BACKWARD_SAFE(type) assert((ptr + sizeof(type) - 1) < hi)

#define CF_INDEX_CHECK (ptr + i < hi)
#define CB_INDEX_CHECK (lo <= ptr + i)
#define CE_INDEX_CHECK (lo <= ptr + i) && (ptr + i < hi)

#define MAKE_LOOP(TYPE, COND, SHIFT_FIDDLE) \
    TYPE v = 0; \
    for (TYPE i = 0; i < sizeof(TYPE); i++) { \
        if (COND) { \
            v += (TYPE)ptr[i] << ((SHIFT_FIDDLE)*8); \
        } \
    } \
    return v;

#define MAKE_LOOP_BE(TYPE, COND) \
    MAKE_LOOP(TYPE, COND, sizeof(TYPE)-i-1)
#define MAKE_LOOP_LE(TYPE, COND) \
    MAKE_LOOP(TYPE, COND, i)

#define MAKE_LOOP_BE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CF_INDEX_CHECK)
#define MAKE_LOOP_BE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CB_INDEX_CHECK)
#define MAKE_LOOP_BE_CE(TYPE) MAKE_LOOP_BE(TYPE, CE_INDEX_CHECK)
#define MAKE_LOOP_LE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CF_INDEX_CHECK)
#define MAKE_LOOP_LE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CB_INDEX_CHECK)
#define MAKE_LOOP_LE_CE(TYPE) MAKE_LOOP_LE(TYPE, CE_INDEX_CHECK)

#define MAKE_LOADVAL(type, name) \
    static really_inline \
    type name(const u8 *ptr, UNUSED const u8 *lo, UNUSED const u8 *hi)

#define NORMAL_SAFE(type) \
    do { \
        assert(ptr >= lo); \
        assert(ptr + sizeof(type) - 1 < hi); \
    } while(0)

#define MAKE_LOOP_CE(TYPE) \
    TYPE v = 0; \
    for (TYPE i = 0; i < sizeof(TYPE); i++) { \
        if ((lo <= ptr + i) && (ptr + i < hi)) { \
            v += (TYPE)ptr[i] << (i*8); \
        } \
    } \
    return v;
// no suffix = normal (unaligned) // no suffix = normal (unaligned)
// _a = aligned
// _cf = cautious forwards, base is always in bounds, but may read over the end of the buffer (test against hi)
// _cb = cautious backwards, final byte is always in bounds, but may read over the start of the buffer (test against lo)
// _ce = cautious everywhere (in both directions); test against hi and lo // _ce = cautious everywhere (in both directions); test against hi and lo
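Illustrative sketch only (not from the patch): a cautious-everywhere load reads byte by byte and substitutes zero for any byte outside [lo, hi), which is what the new MAKE_LOOP_CE macro above expands to. A standalone version, assuming the u8/u32 typedefs from ue2common.h:

    static inline u32 load_u32_cautious_everywhere(const u8 *ptr, const u8 *lo,
                                                   const u8 *hi) {
        u32 v = 0;
        for (u32 i = 0; i < sizeof(u32); i++) {
            if (lo <= ptr + i && ptr + i < hi) { /* out-of-bounds bytes read as 0 */
                v += (u32)ptr[i] << (i * 8);
            }
        }
        return v;
    }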
// u8 loadvals
MAKE_LOADVAL(u8, lv_u8) {
NORMAL_SAFE(u8);
return *ptr;
}
MAKE_LOADVAL(u8, lv_u8_cf) {
CAUTIOUS_FORWARD_SAFE(u8);
if (ptr < hi) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u8, lv_u8_cb) {
CAUTIOUS_BACKWARD_SAFE(u8);
if (lo <= ptr) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u8, lv_u8_ce) {
if ((lo <= ptr) && (ptr < hi)) {
return *ptr;
} else {
return 0;
}
}
MAKE_LOADVAL(u16, lv_u16) { MAKE_LOADVAL(u16, lv_u16) {
NORMAL_SAFE(u16); NORMAL_SAFE(u16);
return unaligned_load_u16(ptr); return unaligned_load_u16(ptr);
} }
MAKE_LOADVAL(u16, lv_u16_a) {
ALIGNED_SAFE(u16);
return *(const u16 *)ptr;
}
MAKE_LOADVAL(u32, lv_u32) {
NORMAL_SAFE(u32);
return unaligned_load_u32(ptr);
}
MAKE_LOADVAL(u32, lv_u32_a) {
ALIGNED_SAFE(u32);
return *(const u32 *)ptr;
}
MAKE_LOADVAL(u64a, lv_u64a) { MAKE_LOADVAL(u64a, lv_u64a) {
NORMAL_SAFE(u32); NORMAL_SAFE(u32);
return unaligned_load_u64a(ptr); return unaligned_load_u64a(ptr);
} }
MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_CE(u16); }
MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_CE(u64a); }

MAKE_LOADVAL(u64a, lv_u64a_a) {
    ALIGNED_SAFE(u64a);
    return *(const u64a *)ptr;
}

MAKE_LOADVAL(u16, lv_u16_cf) { MAKE_LOOP_LE_CF(u16); }
MAKE_LOADVAL(u16, lv_u16_cb) { MAKE_LOOP_LE_CB(u16); }
MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_LE_CE(u16); }
MAKE_LOADVAL(u32, lv_u32_cf) { MAKE_LOOP_LE_CF(u32); }
MAKE_LOADVAL(u32, lv_u32_cb) { MAKE_LOOP_LE_CB(u32); }
MAKE_LOADVAL(u32, lv_u32_ce) { MAKE_LOOP_LE_CE(u32); }
MAKE_LOADVAL(u64a, lv_u64a_cf) { MAKE_LOOP_LE_CF(u64a); }
MAKE_LOADVAL(u64a, lv_u64a_cb) { MAKE_LOOP_LE_CB(u64a); }
MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_LE_CE(u64a); }
MAKE_LOADVAL(m128, lv_m128) {
NORMAL_SAFE(m128);
return loadu128(ptr);
}
MAKE_LOADVAL(m128, lv_m128_a) {
ALIGNED_SAFE(m128);
assert((size_t)ptr % sizeof(m128) == 0);
return *(const m128 *)ptr;
}
// m128 cases need to be manually created
MAKE_LOADVAL(m128, lv_m128_cf) {
CAUTIOUS_FORWARD_SAFE(m128);
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if (ptr + i < hi) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
MAKE_LOADVAL(m128, lv_m128_cb) {
CAUTIOUS_BACKWARD_SAFE(m128);
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if (lo <= ptr + i) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
MAKE_LOADVAL(m128, lv_m128_ce) {
union {
u8 val8[16];
m128 val128;
} u;
for (u32 i = 0; i < 16; i++) {
if ((lo <= ptr + i) && (ptr + i < hi)) {
u.val8[i] = ptr[i];
} else {
u.val8[i] = 0;
}
}
return u.val128;
}
#endif #endif


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "fdr.h"
#include "fdr_internal.h" #include "fdr_internal.h"
#include "fdr_streaming_internal.h" #include "fdr_streaming_internal.h"
#include "fdr_compile_internal.h" #include "fdr_compile_internal.h"


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -32,6 +32,8 @@
#include "fdr_streaming_internal.h" #include "fdr_streaming_internal.h"
#include "util/partial_store.h" #include "util/partial_store.h"
#include <string.h>
static really_inline static really_inline
const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) { const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) {
const u8 * linkPtr = ((const u8 *)fdr) + fdr->link; const u8 * linkPtr = ((const u8 *)fdr) + fdr->link;


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "fdr.h"
#include "fdr_internal.h" #include "fdr_internal.h"
#include "fdr_confirm.h" #include "fdr_confirm.h"
#include "fdr_compile_internal.h" #include "fdr_compile_internal.h"


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -26,11 +26,19 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "config.h" /** \file
* \brief Teddy literal matcher: SSSE3 engine runtime.
*/
#include "fdr_internal.h"
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/simd_utils.h" #include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h" #include "util/simd_utils_ssse3.h"
static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -67,178 +75,584 @@ static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
}; };
// Note: p_mask is an output param that initialises a poison mask.
UNUSED static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
    if (unlikely(isnonzero128(var))) {                                      \
        u64a lo = movq(var);                                                \
        u64a hi = movq(byteShiftRight128(var, 8));                          \
        if (unlikely(lo)) {                                                 \
            conf_fn(&lo, bucket, offset, confBase, reason, a, ptr,          \
                    control, &last_match);                                  \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
        if (unlikely(hi)) {                                                 \
            conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr,      \
                    control, &last_match);                                  \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
    }                                                                       \
} while (0);
#else
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(isnonzero128(var))) { \
u32 part1 = movd(var); \
u32 part2 = movd(byteShiftRight128(var, 4)); \
u32 part3 = movd(byteShiftRight128(var, 8)); \
u32 part4 = movd(byteShiftRight128(var, 12)); \
if (unlikely(part1)) { \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part2)) { \
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part3)) { \
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part4)) { \
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
} while (0);
#endif
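Illustrative sketch only (not from the patch): once CONFIRM_TEDDY has extracted a nonzero chunk of the result vector, each set bit names a candidate match; with 8 buckets, bit b corresponds to bucket (b % 8) at byte (b / 8) within that chunk. A hedged sketch of the walk, where chunk_offset (0 or 8 above) and the confirm lookup are placeholders:

    u64a bits = lo; /* a nonzero chunk of the result vector, e.g. the 'lo' half */
    while (bits) {
        u32 bit    = findAndClearLSB_64(&bits); /* helper from util/bitutils.h */
        u32 byte   = bit / 8 + chunk_offset;    /* byte position in this iteration */
        u32 bucket = bit % 8;                   /* which literal bucket fired */
        /* look up the confirm structure for 'bucket' and run it at 'byte' */
    }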
    if (ptr >= lo) {
        u32 avail = (u32)(hi - ptr);
        if (avail >= 16) {
            *p_mask = load128((const void*)(p_mask_arr[16] + 16));
            return loadu128(ptr);
        }
        *p_mask = load128((const void*)(p_mask_arr[avail] + 16));
        for (u32 i = 0; i < avail; i++) {
            u.val8[i] = ptr[i];
        }
    } else {
        u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1));
        u32 start = (u32)(lo - ptr);
        u32 i;
        for (i = start - need; ptr + i < lo; i++) {
            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
        }
        u32 end = MIN(16, (u32)(hi - ptr));
        *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
        for (; i < end; i++) {
            u.val8[i] = ptr[i];
        }
    }

static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift2x64(val, 4), mask);
    return and128(and128(pshufb(maskBase[0*2], lo),
                         pshufb(maskBase[0*2+1], hi)), p_mask);
}

static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
                        m128 val) {
    m128 mask = set16x8(0xf);
    m128 lo = and128(val, mask);
    m128 hi = and128(rshift2x64(val, 4), mask);
    m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);

    m128 res_1 = and128(pshufb(maskBase[1*2], lo),
                        pshufb(maskBase[1*2+1], hi));
    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
    *old_1 = res_1;
    return and128(and128(r, p_mask), res_shifted_1);
}
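Illustrative sketch only (not from the patch): the prep_conf_teddy_* helpers above implement a per-byte nibble shuffle. Each input byte is split into its low and high 4-bit nibbles; each nibble selects a bucket bitmask from a 16-entry table (via pshufb), and the two results are ANDed, so a bucket bit survives only if both nibbles are compatible with some literal in that bucket. A scalar model with hypothetical table names:

    /* hypothetical scalar model of one byte; lo_table/hi_table hold per-nibble
     * bucket masks, one bit per bucket */
    static inline u8 bucket_hits(u8 c, const u8 lo_table[16], const u8 hi_table[16]) {
        return (u8)(lo_table[c & 0xf] & hi_table[c >> 4]);
    }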
static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift2x64(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
pshufb(maskBase[2*2+1], hi));
m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
*old_2 = res_2;
return and128(r, res_shifted_2);
}
static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift2x64(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
pshufb(maskBase[3*2+1], hi));
m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
*old_3 = res_3;
return and128(r, res_shifted_3);
}
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 1);
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16;
} }
    if (ptr + 16 < buf_end) {
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
        ptr += 16;
    }

    return u.val128;
}
#if defined(__AVX2__)
UNUSED static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
return ret;
}
static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
};
UNUSED static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history) {
union {
u8 val8[32];
m256 val256;
} u;
if (ptr >= lo) {
u32 avail = (u32)(hi - ptr);
if (avail >= 32) {
*p_mask = load256((const void*)(p_mask_arr256[32] + 32));
return loadu256(ptr);
}
*p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
for (u32 i = 0; i < avail; i++) {
u.val8[i] = ptr[i];
}
} else {
// need contains "how many chars to pull from history"
// calculate based on what we need, what we have in the buffer
// and only what we need to make primary confirm work
u32 start = (u32)(lo - ptr);
u32 i;
for (i = start; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
}
u32 end = MIN(32, (u32)(hi - ptr));
*p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
for (; i < end; i++) {
u.val8[i] = ptr[i];
}
} }
    return u.val256;
}

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
    }

    *a->groups = controlVal;
    return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
    const m128 *maskBase = getMaskBase(teddy);
    const u32 *confBase = getConfBase(teddy, 1);

    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        ptr = mainStart - 16;
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
                                     a->buf_history, a->len_history, 1);
        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
    }

    *a->groups = controlVal;
    return HWLM_SUCCESS;
}

#endif // __AVX2__

#define P0(cnd) unlikely(cnd)

#include "fdr.h"
#include "fdr_internal.h"
#include "flood_runtime.h"

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"

#include "fdr_loadval.h"
#include "util/bitutils.h"
#include "teddy_internal.h"

#include "teddy_autogen.c"

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 2);
m128 res_old_1 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 2);
m128 res_old_1 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 3);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 3);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 4);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
m128 res_old_3 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t *control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 4);
m128 res_old_1 = ones128();
m128 res_old_2 = ones128();
m128 res_old_3 = ones128();
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
}
*a->groups = controlVal;
return HWLM_SUCCESS;
}

src/fdr/teddy.h (new file, 97 lines)

@ -0,0 +1,97 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: function declarations.
*/
#ifndef TEDDY_H_
#define TEDDY_H_
struct FDR; // forward declaration from fdr_internal.h
struct FDR_Runtime_Args;
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
#if defined(__AVX2__)
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a);
#endif /* __AVX2__ */
#endif /* TEDDY_H_ */
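Illustrative sketch only (not from this header): all of these routines share one signature, so one plausible way to select among them is a small table keyed by mask count and confirm packing. The table below is hypothetical, not code from the library:

    typedef hwlm_error_t (*teddy_exec_fn)(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a);

    static const teddy_exec_fn teddy_exec[4][2] = {
        { fdr_exec_teddy_msks1, fdr_exec_teddy_msks1_pck },
        { fdr_exec_teddy_msks2, fdr_exec_teddy_msks2_pck },
        { fdr_exec_teddy_msks3, fdr_exec_teddy_msks3_pck },
        { fdr_exec_teddy_msks4, fdr_exec_teddy_msks4_pck },
    };
    /* usage (hypothetical): teddy_exec[numMasks - 1][packed](fdr, a); */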


@ -1,545 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2015, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Intel Corporation nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
from autogen_utils import *
from base_autogen import *
from string import Template
class MT(MatcherBase):
def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
if self.packed:
print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
else:
if self.num_masks == 1:
conf_func = "confWithBit1"
else:
conf_func = "confWithBitMany"
if cautious:
caution_string = "VECTORING"
else:
caution_string = "NOT_CAUTIOUS"
print " if (P0(!!%s)) {" % var_name
print " do {"
if bits == 64:
print " bit = findAndClearLSB_64(&%s);" % (var_name)
else:
print " bit = findAndClearLSB_32(&%s);" % (var_name)
print " byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
print " idx = bit %% %d;" % self.num_buckets
print " cf = confBase[idx];"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " if (!(fdrc->groups & *control))"
print " continue;"
print " %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string)
print " } while(P0(!!%s));" % var_name
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
print " *a->groups = controlVal;"
print " return HWLM_TERMINATED;"
print " }"
print " }"
def produce_needed_temporaries(self, max_iterations):
print " m128 p_mask;"
for iter in range(0, max_iterations):
print " m128 val_%d;" % iter
print " m128 val_%d_lo;" % iter
print " m128 val_%d_hi;" % iter
for x in range(self.num_masks):
print " m128 res_%d_%d;" % (iter, x)
if x != 0:
print " m128 res_shifted_%d_%d;" % (iter, x)
print " m128 r_%d;" % iter
print "#ifdef ARCH_64_BIT"
print " u64a r_%d_lopart;" % iter
print " u64a r_%d_hipart;" % iter
print "#else"
print " u32 r_%d_part1;" % iter
print " u32 r_%d_part2;" % iter
print " u32 r_%d_part3;" % iter
print " u32 r_%d_part4;" % iter
print "#endif"
def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
cautious, save_old):
if cautious:
print " val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
else:
print " val_%d = load128(ptr + %d);" % (iter, iter*16)
print " val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
print " val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
print " val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
print
for x in range(self.num_masks):
print Template("""
res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2] , val_${ITER}_lo),
pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
if x != 0:
if iter == 0:
print " res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
else:
print " res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
if x != 0 and iter == effective_num_iterations - 1 and save_old:
print " res_old_%d = res_%d_%d;" % (x, iter, x)
print
if cautious:
print " r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
else:
print " r_%d = res_%d_0;" % (iter, iter)
for x in range(1, self.num_masks):
print " r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
print
def produce_one_iteration_confirm(self, iter, confirmCautious):
setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
(8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
(4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
(8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
(12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
print " if (P0(isnonzero128(r_%d))) {" % (iter)
print "#ifdef ARCH_64_BIT"
for (off, val, init) in setup64:
print " %s = %s;" % (val, init)
for (off, val, init) in setup64:
self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
print "#else"
for (off, val, init) in setup32:
print " %s = %s;" % (val, init)
for (off, val, init) in setup32:
self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
print "#endif"
print " }"
def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
confirmCautious = True, save_old = True):
self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
self.produce_one_iteration_confirm(iter, confirmCautious)
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
for x in range(self.num_masks):
if (x != 0):
print " m128 res_old_%d = ones128();" % x
print " m128 lomask = set16x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 16;"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " if (ptr + 16 < buf + len) {"
self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range(self.num_iterations):
self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 16) {"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " }"
print self.produce_footer()
def produce_compile_call(self):
packed_str = { False : "false", True : "true"}[self.packed]
print " { %d, %s, %d, %d, %s, %d, %d }," % (
self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
self.conf_pull_back, self.conf_top_level_split)
def get_name(self):
if self.packed:
pck_string = "_pck"
else:
pck_string = ""
if self.num_buckets == 16:
type_string = "_fat"
else:
type_string = ""
return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
self.arch = arch
self.packed = packed
self.num_masks = num_masks
self.num_buckets = num_buckets
self.num_iterations = 2
if packed:
self.conf_top_level_split = 32
else:
self.conf_top_level_split = 1
self.conf_pull_back = 0
class MTFat(MT):
def produce_needed_temporaries(self, max_iterations):
print " m256 p_mask;"
for iter in range(0, max_iterations):
print " m256 val_%d;" % iter
print " m256 val_%d_lo;" % iter
print " m256 val_%d_hi;" % iter
for x in range(self.num_masks):
print " m256 res_%d_%d;" % (iter, x)
if x != 0:
print " m256 res_shifted_%d_%d;" % (iter, x)
print " m256 r_%d;" % iter
print "#ifdef ARCH_64_BIT"
print " u64a r_%d_part1;" % iter
print " u64a r_%d_part2;" % iter
print " u64a r_%d_part3;" % iter
print " u64a r_%d_part4;" % iter
print "#else"
print " u32 r_%d_part1;" % iter
print " u32 r_%d_part2;" % iter
print " u32 r_%d_part3;" % iter
print " u32 r_%d_part4;" % iter
print " u32 r_%d_part5;" % iter
print " u32 r_%d_part6;" % iter
print " u32 r_%d_part7;" % iter
print " u32 r_%d_part8;" % iter
print "#endif"
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
for x in range(self.num_masks):
if (x != 0):
print " m256 res_old_%d = ones256();" % x
print " m256 lomask = set32x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 16;"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " if (ptr + 16 < buf + len) {"
self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
print " ptr += 16;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range(self.num_iterations):
self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 16) {"
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
print " }"
print self.produce_footer()
def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
cautious, save_old):
if cautious:
print " val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
else:
print " val_%d = load2x128(ptr + %d);" % (iter, iter*16)
print " val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
print " val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
print " val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
print
for x in range(self.num_masks):
print Template("""
res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2] , val_${ITER}_lo),
vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
if x != 0:
if iter == 0:
print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
else:
print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
if x != 0 and iter == effective_num_iterations - 1 and save_old:
print " res_old_%d = res_%d_%d;" % (x, iter, x)
print
if cautious:
print " r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
else:
print " r_%d = res_%d_0;" % (iter, iter)
for x in range(1, self.num_masks):
print " r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
print
def produce_one_iteration_confirm(self, iter, confirmCautious):
setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
(4, "r_%d_part2" % iter, "extract64from256(r, 1);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
(8, "r_%d_part3" % iter, "extractlow64from256(r)"),
(12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
(2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
(4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
(6, "r_%d_part4" % iter, "extract32from256(r, 3);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
(8, "r_%d_part5" % iter, "extractlow32from256(r)"),
(10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
(12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
(14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
print " if (P0(isnonzero256(r_%d))) {" % (iter)
print " m256 r_swap = swap128in256(r_%d);" % (iter)
print " m256 r = interleave256lo(r_%d, r_swap);" % (iter)
print "#ifdef ARCH_64_BIT"
for (off, val, init) in setup64:
print " %s = %s;" % (val, init)
for (off, val, init) in setup64:
self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
print "#else"
for (off, val, init) in setup32:
print " %s = %s;" % (val, init)
for (off, val, init) in setup32:
self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
print "#endif"
print " }"
class MTFast(MatcherBase):
def produce_confirm(self, cautious):
if cautious:
cautious_str = "VECTORING"
else:
cautious_str = "NOT_CAUTIOUS"
print " for (u32 i = 0; i < arrCnt; i++) {"
print " byte = bitArr[i] / 8;"
if self.packed:
conf_split_mask = IntegerType(32).constant_to_string(
self.conf_top_level_split - 1)
print " bitRem = bitArr[i] % 8;"
print " confSplit = *(ptr+byte) & 0x1f;"
print " idx = confSplit * %d + bitRem;" % self.num_buckets
print " cf = confBase[idx];"
print " if (!cf)"
print " continue;"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " if (!(fdrc->groups & *control))"
print " continue;"
print " confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str
else:
print " cf = confBase[bitArr[i] % 8];"
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
print " confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
print " *a->groups = controlVal;"
print " return HWLM_TERMINATED;"
print " }"
print " }"
def produce_needed_temporaries(self, max_iterations):
print " u32 arrCnt;"
print " u16 bitArr[512];"
print " m256 p_mask;"
print " m256 val_0;"
print " m256 val_0_lo;"
print " m256 val_0_hi;"
print " m256 res_0;"
print " m256 res_1;"
print " m128 lo_part;"
print " m128 hi_part;"
print "#ifdef ARCH_64_BIT"
print " u64a r_0_part;"
print "#else"
print " u32 r_0_part;"
print "#endif"
def produce_bit_scan(self, offset, bits):
print " while (P0(!!r_0_part)) {"
if bits == 64:
print " bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
else:
print " bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
print " }"
def produce_bit_check_128(self, var_name, offset):
print " if (P0(isnonzero128(%s))) {" % (var_name)
print "#ifdef ARCH_64_BIT"
print " r_0_part = movq(%s);" % (var_name)
self.produce_bit_scan(offset, 64)
print " r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
self.produce_bit_scan(offset + 1, 64)
print "#else"
print " r_0_part = movd(%s);" % (var_name)
self.produce_bit_scan(offset * 2, 32)
for step in range(1, 4):
print " r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
self.produce_bit_scan(offset * 2 + step, 32)
print "#endif"
print " }"
def produce_bit_check_256(self, iter, single_iter, cautious):
print " if (P0(isnonzero256(res_%d))) {" % (iter)
if single_iter:
print " arrCnt = 0;"
print " lo_part = cast256to128(res_%d);" % (iter)
print " hi_part = cast256to128(swap128in256(res_%d));" % (iter)
self.produce_bit_check_128("lo_part", iter * 4)
self.produce_bit_check_128("hi_part", iter * 4 + 2)
if single_iter:
self.produce_confirm(cautious)
print " }"
def produce_one_iteration_state_calc(self, iter, cautious):
if cautious:
print " val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
else:
print " val_0 = load256(ptr + %d);" % (iter * 32)
print " val_0_lo = and256(val_0, lomask);"
print " val_0_hi = rshift4x64(val_0, 4);"
print " val_0_hi = and256(val_0_hi, lomask);"
print " res_%d = and256(vpshufb(maskLo , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
if cautious:
print " res_%d = and256(res_%d, p_mask);" % (iter, iter)
def produce_code(self):
print self.produce_header(visible = True, header_only = False)
print self.produce_common_declarations()
print
self.produce_needed_temporaries(self.num_iterations)
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
print " const m256 maskLo = set2x128(maskBase[0]);"
print " const m256 maskHi = set2x128(maskBase[1]);"
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
print " const size_t iterBytes = %d;" % (self.num_iterations * 32)
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
' buf, len, a->start_offset);'
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
' mainStart);'
print " const m256 lomask = set32x8(0xf);"
print " if (ptr < mainStart) {"
print " ptr = mainStart - 32;"
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " ptr += 32;"
print " }"
print " if (ptr + 32 < buf + len) {"
self.produce_one_iteration_state_calc(iter = 0, cautious = False)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " ptr += 32;"
print " }"
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
print " __builtin_prefetch(ptr + (iterBytes*4));"
print self.produce_flood_check()
for iter in range (0, self.num_iterations):
self.produce_one_iteration_state_calc(iter = iter, cautious = False)
print " arrCnt = 0;"
for iter in range (0, self.num_iterations):
self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
self.produce_confirm(cautious = False)
print " }"
print " for (; ptr < buf + len; ptr += 32) {"
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
print " }"
print self.produce_footer()
def get_name(self):
if self.packed:
pck_string = "_pck"
else:
pck_string = ""
return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
def produce_compile_call(self):
packed_str = { False : "false", True : "true"}[self.packed]
print " { %d, %s, %d, %d, %s, %d, %d }," % (
self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
self.conf_pull_back, self.conf_top_level_split)
def __init__(self, arch, packed = False):
self.arch = arch
self.packed = packed
self.num_masks = 1
self.num_buckets = 8
self.num_iterations = 2
self.conf_top_level_split = 1
self.conf_pull_back = 0
if packed:
self.conf_top_level_split = 32
else:
self.conf_top_level_split = 1
self.conf_pull_back = 0

src/fdr/teddy_avx2.c (new file, 1110 lines): diff suppressed because it is too large.


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,7 +26,6 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
-#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"
@ -65,7 +64,32 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
    return false;
}
-#include "teddy_autogen_compiler.cpp"
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 },
{ 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 },
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 },
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 },
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 },
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 },
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 },
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 },
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 },
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 },
{ 11, 0, 1, 8, false, 0, 1 },
{ 12, 0, 1, 8, true, 0, 32 },
{ 13, 0, 2, 8, false, 0, 1 },
{ 14, 0, 2, 8, true, 0, 32 },
{ 15, 0, 3, 8, false, 0, 1 },
{ 16, 0, 3, 8, true, 0, 32 },
{ 17, 0, 4, 8, false, 0, 1 },
{ 18, 0, 4, 8, true, 0, 32 },
};
out->clear();
for (const auto &def : defns) {
out->emplace_back(def);
}
}
static
size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {


@ -0,0 +1,256 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Teddy literal matcher: common runtime procedures.
*/
#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_
#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif
#define CHECK_HWLM_TERMINATE_MATCHING \
do { \
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
*a->groups = controlVal; \
return HWLM_TERMINATED; \
} \
} while (0);
#define CHECK_FLOOD \
do { \
if (unlikely(ptr > tryFloodDetect)) { \
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \
&floodBackoff, &controlVal, \
iterBytes); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} while (0);
/*
* \brief Copy a block of [0,15] bytes efficiently.
*
* This function is a workaround intended to stop some compilers from
* synthesizing a memcpy function call out of the copy of a small number of
* bytes that we do in vectoredLoad128.
*/
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
switch (len) {
case 0:
break;
case 1:
*dst = *src;
break;
case 2:
unaligned_store_u16(dst, unaligned_load_u16(src));
break;
case 3:
unaligned_store_u16(dst, unaligned_load_u16(src));
dst[2] = src[2];
break;
case 4:
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 5:
case 6:
case 7:
/* Perform copy with two overlapping 4-byte chunks. */
unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 8:
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
default:
/* Perform copy with two overlapping 8-byte chunks. */
assert(len < 16);
unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
}
}
// Note: p_mask is an output param that initialises a poison mask.
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
union {
u8 val8[16];
m128 val128;
} u;
u.val128 = zeroes128();
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) {
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 16) {
*p_mask = load128(p_mask_arr[16] + 16);
return loadu128(ptr);
}
*p_mask = load128(p_mask_arr[avail] + 16);
copy_start = 0;
copy_len = avail;
} else {
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(len_history, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start - need; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
}
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
copy_start = i;
copy_len = end - i;
}
// Runt block from the buffer.
copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
return u.val128;
}
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
CautionReason reason) {
u64a confVal = 0;
const u8 *buf = a->buf;
size_t len = a->len;
const u8 *confirm_loc = ptr + byte - 7;
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
confVal = lv_u64a(confirm_loc, buf, buf + len);
} else { // r == VECTORING, confirm_loc < buf
u64a histBytes = a->histBytes;
confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
// stitch together confVal and history
u32 overhang = buf - confirm_loc;
histBytes >>= 64 - (overhang * 8);
confVal |= histBytes;
}
return confVal;
}
static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr+byte) & 0x1f;
u32 idx = confSplit * bucket + bitRem;
u32 cf = confBase[idx];
if (!cf) {
continue;
}
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
last_match, confVal);
} while (unlikely(*conf));
}
static really_inline
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
confVal);
} while (unlikely(*conf));
}
static really_inline
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
last_match, confVal);
} while (unlikely(*conf));
}
static really_inline
const m128 * getMaskBase(const struct Teddy *teddy) {
return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
}
static really_inline
const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
(numMask*32));
}
#endif /* TEDDY_RUNTIME_COMMON_H_ */


@ -54,7 +54,6 @@ Grey::Grey(void) :
    allowRose(true),
    allowExtendedNFA(true), /* bounded repeats of course */
    allowLimExNFA(true),
-   allowSidecar(true),
    allowAnchoredAcyclic(true),
    allowSmallLiteralSet(true),
    allowCastle(true),
@ -207,7 +206,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
    G_UPDATE(allowRose);
    G_UPDATE(allowExtendedNFA);
    G_UPDATE(allowLimExNFA);
-   G_UPDATE(allowSidecar);
    G_UPDATE(allowAnchoredAcyclic);
    G_UPDATE(allowSmallLiteralSet);
    G_UPDATE(allowCastle);


@ -54,7 +54,6 @@ struct Grey {
    bool allowRose;
    bool allowExtendedNFA;
    bool allowLimExNFA;
-   bool allowSidecar;
    bool allowAnchoredAcyclic;
    bool allowSmallLiteralSet;
    bool allowCastle;


@ -39,6 +39,7 @@
#include "compiler/error.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_expr_info.h"
+#include "nfagraph/ng_extparam.h"
#include "parser/parse_error.h"
#include "parser/Parser.h"
#include "parser/prefilter.h"
@ -310,7 +311,8 @@ hs_error_t hs_compile_ext_multi(const char * const *expressions,
static
hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
-                                  unsigned int mode, hs_expr_info_t **info,
+                                  const hs_expr_ext_t *ext, unsigned int mode,
+                                  hs_expr_info_t **info,
                                  hs_compile_error_t **error) {
    if (!error) {
        // nowhere to write an error, but we can still return an error code.
@ -347,7 +349,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
    }

    ReportManager rm(cc.grey);
-    ParsedExpression pe(0, expression, flags, 0);
+    ParsedExpression pe(0, expression, flags, 0, ext);
    assert(pe.component);

    // Apply prefiltering transformations if desired.
@ -362,6 +364,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
            throw ParseError("Internal error.");
        }

+        handleExtendedParams(rm, *g, cc);
        fillExpressionInfo(rm, *g, &local_info);
    }
    catch (const CompileError &e) {
@ -394,7 +397,16 @@ extern "C" HS_PUBLIC_API
hs_error_t hs_expression_info(const char *expression, unsigned int flags,
                              hs_expr_info_t **info,
                              hs_compile_error_t **error) {
-    return hs_expression_info_int(expression, flags, HS_MODE_BLOCK, info,
+    return hs_expression_info_int(expression, flags, nullptr, HS_MODE_BLOCK,
+                                  info, error);
+}
+
+extern "C" HS_PUBLIC_API
+hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
+                                  const hs_expr_ext_t *ext,
+                                  hs_expr_info_t **info,
+                                  hs_compile_error_t **error) {
+    return hs_expression_info_int(expression, flags, ext, HS_MODE_BLOCK, info,
                                  error);
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -500,6 +500,25 @@ const char *hs_version(void);
 */
#define HS_BAD_ALLOC (-9)
/**
* The scratch region was already in use.
*
* This error is returned when Hyperscan is able to detect that the scratch
* region given is already in use by another Hyperscan API call.
*
* A separate scratch region, allocated with @ref hs_alloc_scratch() or @ref
* hs_clone_scratch(), is required for every concurrent caller of the Hyperscan
* API.
*
* For example, this error might be returned when @ref hs_scan() has been
* called inside a callback delivered by a currently-executing @ref hs_scan()
* call using the same scratch region.
*
* Note: Not all concurrent uses of scratch regions may be detected. This error
* is intended as a best-effort debugging tool, not a guarantee.
*/
#define HS_SCRATCH_IN_USE (-10)
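To illustrate the scenario described above, here is a hedged sketch (not part of this header): the hs_scan() call, the callback signature and the error value are from the public API, while the scan_ctx structure and its field names are invented for the example. The callback re-enters the scanner with the scratch region that is already driving it, which is expected to fail with HS_SCRATCH_IN_USE:

#include <hs.h>

struct scan_ctx {
    const hs_database_t *db; /* hypothetical context carried via 'context' */
    const char *data;
    unsigned int len;
    hs_scratch_t *scratch;   /* the scratch already in use by the outer scan */
};

static int onMatch(unsigned int id, unsigned long long from,
                   unsigned long long to, unsigned int flags, void *context) {
    struct scan_ctx *sc = (struct scan_ctx *)context;
    /* Re-entering the API with the same scratch region is detected
     * (best-effort) and rejected with HS_SCRATCH_IN_USE. */
    hs_error_t err = hs_scan(sc->db, sc->data, sc->len, 0, sc->scratch,
                             onMatch, context);
    if (err == HS_SCRATCH_IN_USE) {
        /* Allocate a separate scratch (hs_alloc_scratch/hs_clone_scratch)
         * for nested or concurrent calls instead. */
    }
    return 0; /* continue matching */
}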
/** @} */
#ifdef __cplusplus


@ -158,7 +158,7 @@ typedef struct hs_platform_info {
/**
 * A type containing information related to an expression that is returned by
- * @ref hs_expression_info().
+ * @ref hs_expression_info() or @ref hs_expression_ext_info.
 */
typedef struct hs_expr_info {
    /**
@ -201,7 +201,8 @@ typedef struct hs_expr_info {
/**
 * A structure containing additional parameters related to an expression,
- * passed in at build time to @ref hs_compile_ext_multi().
+ * passed in at build time to @ref hs_compile_ext_multi() or @ref
+ * hs_expression_ext_info.
 *
 * These parameters allow the set of matches produced by a pattern to be
 * constrained at compile time, rather than relying on the application to
@ -401,7 +402,7 @@ hs_error_t hs_compile_multi(const char *const *expressions,
                            hs_database_t **db, hs_compile_error_t **error);

/**
- * The multiple regular expression compiler with extended pattern support.
+ * The multiple regular expression compiler with extended parameter support.
 *
 * This function call compiles a group of expressions into a database in the
 * same way as @ref hs_compile_multi(), but allows additional parameters to be
@ -550,6 +551,62 @@ hs_error_t hs_expression_info(const char *expression, unsigned int flags,
                              hs_expr_info_t **info,
                              hs_compile_error_t **error);
/**
* Utility function providing information about a regular expression, with
* extended parameter support. The information provided in @ref hs_expr_info_t
* includes the minimum and maximum width of a pattern match.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @a flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @a expression, and @ref HS_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
* expression per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
*
* @param ext
* A pointer to a filled @ref hs_expr_ext_t structure that defines
* extended behaviour for this pattern. NULL may be specified if no
* extended parameters are needed.
*
* @param info
* On success, a pointer to the pattern information will be returned in
* this parameter, or NULL on failure. This structure is allocated using
* the allocator supplied in @ref hs_set_allocator() (or malloc() if no
* allocator was set) and should be freed by the caller.
*
* @param error
* If the call fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error);
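A hedged usage sketch of the new function (not part of this header; the pattern, flag and extended-parameter values are arbitrary, and the hs_expr_ext_t flag/field names used are the ones defined elsewhere in this header for hs_compile_ext_multi()):

#include <hs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_MIN_OFFSET; /* only min_offset below is honoured */
    ext.min_offset = 16;                /* matches must end at offset >= 16 */

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *compile_err = NULL;
    if (hs_expression_ext_info("foo.*bar", HS_FLAG_DOTALL, &ext, &info,
                               &compile_err) != HS_SUCCESS) {
        fprintf(stderr, "pattern rejected: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return 1;
    }
    printf("min width %u, max width %u\n", info->min_width, info->max_width);
    free(info); /* allocated with the configured allocator (malloc by default) */
    return 0;
}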
/**
 * Populates the platform information based on the current host.
 *


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,7 +37,6 @@
#include "noodle_build.h"
#include "ue2common.h"
#include "fdr/fdr_compile.h"
-#include "fdr/fdr.h"
#include "nfa/shufticompile.h"
#include "util/alloc.h"
#include "util/bitutils.h"
@ -526,8 +525,7 @@ aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
        DEBUG_PRINTF("build noodle table\n");
        engType = HWLM_ENGINE_NOOD;
        const hwlmLiteral &lit = lits.front();
-        auto noodle = noodBuildTable((const u8 *)lit.s.c_str(), lit.s.length(),
-                                     lit.nocase, lit.id);
+        auto noodle = noodBuildTable(lit);
        if (noodle) {
            engSize = noodSize(noodle.get());
        }


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -34,13 +34,11 @@
#include "util/compare.h" // for ourisalpha
#include "util/ue2string.h" // for escapeString

-#include <algorithm>
#include <iomanip>
#include <sstream>

-#include <boost/algorithm/cxx11/all_of.hpp>
-
using namespace std;
-using namespace boost::algorithm;

namespace ue2 {
@ -91,10 +89,17 @@ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
    assert(msk.size() <= HWLM_MASKLEN);
    assert(msk.size() == cmp.size());

-    DEBUG_PRINTF("literal '%s', msk=%s, cmp=%s\n",
-                 escapeString(s).c_str(), dumpMask(msk).c_str(),
+    // If we've been handed a nocase literal, all letter characters must be
+    // upper-case.
+    if (nocase) {
+        upperString(s);
+    }
+
+    DEBUG_PRINTF("literal '%s'%s, msk=%s, cmp=%s\n", escapeString(s).c_str(),
+                 nocase ? " (nocase)" : "", dumpMask(msk).c_str(),
                 dumpMask(cmp).c_str());

    // Mask and compare vectors MUST be the same size.
    assert(msk.size() == cmp.size());
@ -102,7 +107,7 @@ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
    assert(maskIsConsistent(s, nocase, msk, cmp));

    // In the name of good hygiene, zap msk/cmp if msk is all zeroes.
-    if (all_of_equal(msk.begin(), msk.end(), 0)) {
+    if (all_of(begin(msk), end(msk), [](u8 val) { return val == 0; })) {
        msk.clear();
        cmp.clear();
    }


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -95,11 +95,6 @@ struct hwlmLiteral {
     */
    std::vector<u8> cmp;

-    /** \brief Simple constructor: no group information, no msk/cmp. */
-    hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
-        : s(s_in), id(id_in), nocase(nocase_in), noruns(false),
-          groups(HWLM_ALL_GROUPS), msk(0), cmp(0) {}
-
    /** \brief Complete constructor, takes group information and msk/cmp.
     *
     * This constructor takes a msk/cmp pair. Both must be vectors of length <=
@ -107,6 +102,10 @@ struct hwlmLiteral {
    hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in,
                u32 id_in, hwlm_group_t groups_in,
                const std::vector<u8> &msk_in, const std::vector<u8> &cmp_in);
+
+    /** \brief Simple constructor: no group information, no msk/cmp. */
+    hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
+        : hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {}
};
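For illustration only (not part of the diff; the literal text, id and include path are arbitrary examples): with the delegating constructor above, the short form and the fully specified form below build equivalent literals.

#include "hwlm/hwlm_literal.h" // path as laid out in the Hyperscan source tree

using namespace ue2;

// Short form: no group information, no msk/cmp; delegates to the complete
// constructor with noruns=false, HWLM_ALL_GROUPS and empty msk/cmp vectors.
hwlmLiteral lit_simple("abc", /*nocase=*/false, /*id=*/1);

// Equivalent fully specified form.
hwlmLiteral lit_full("abc", /*nocase=*/false, /*noruns=*/false, /*id=*/1,
                     HWLM_ALL_GROUPS, {}, {});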
/**


@ -26,28 +26,35 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-/** \file
+/**
+ * \file
 * \brief Noodle literal matcher: build code.
 */

-#include <cstring> // for memcpy
-
#include "noodle_build.h"
+
+#include "hwlm_literal.h"
#include "noodle_internal.h"
+#include "ue2common.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/verify_types.h"
-#include "ue2common.h"
+
+#include <cstring> // for memcpy

namespace ue2 {

static
-size_t findNoodFragOffset(const u8 *lit, size_t len, bool nocase) {
+size_t findNoodFragOffset(const hwlmLiteral &lit) {
+    const auto &s = lit.s;
+    const size_t len = lit.s.length();
+
    size_t offset = 0;
    for (size_t i = 0; i + 1 < len; i++) {
        int diff = 0;
-        const char c = lit[i];
-        const char d = lit[i + 1];
-        if (nocase && ourisalpha(c)) {
+        const char c = s[i];
+        const char d = s[i + 1];
+        if (lit.nocase && ourisalpha(c)) {
            diff = (mytoupper(c) != mytoupper(d));
        } else {
            diff = (c != d);
@ -60,21 +67,24 @@ size_t findNoodFragOffset(const u8 *lit, size_t len, bool nocase) {
    return offset;
}

-/** \brief Construct a Noodle matcher for the given literal. */
-aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
-                                             bool nocase, u32 id) {
-    size_t noodle_len = sizeof(noodTable) + len;
-    aligned_unique_ptr<noodTable> n =
-        aligned_zmalloc_unique<noodTable>(noodle_len);
+aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
+    if (!lit.msk.empty()) {
+        DEBUG_PRINTF("noodle can't handle supplementary masks\n");
+        return nullptr;
+    }
+
+    const auto &s = lit.s;
+    size_t noodle_len = sizeof(noodTable) + s.length();
+    auto n = aligned_zmalloc_unique<noodTable>(noodle_len);
    assert(n);

-    size_t key_offset = findNoodFragOffset(lit, len, nocase);
+    size_t key_offset = findNoodFragOffset(lit);

-    n->id = id;
-    n->len = verify_u32(len);
+    n->id = lit.id;
+    n->len = verify_u32(s.length());
    n->key_offset = verify_u32(key_offset);
-    n->nocase = nocase ? 1 : 0;
-    memcpy(n->str, lit, len);
+    n->nocase = lit.nocase ? 1 : 0;
+    memcpy(n->str, s.c_str(), s.length());

    return n;
}


@ -40,9 +40,10 @@ struct noodTable;
namespace ue2 {

+struct hwlmLiteral;
+
/** \brief Construct a Noodle matcher for the given literal. */
-ue2::aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
-                                                  bool nocase, u32 id);
+ue2::aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit);

size_t noodSize(const noodTable *n);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,6 +37,7 @@
#include "util/compare.h"
#include "util/masked_move.h"
#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
#include <ctype.h>
#include <stdbool.h>


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -221,7 +221,7 @@ hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
        u32 z0 = movemask256(eq256(mask1, v));
        u32 z1 = movemask256(eq256(mask2, v));
        u32 z = (lastz0 | (z0 << 1)) & z1;
-        lastz0 = (z0 & 0x80000000) >> 31;
+        lastz0 = z0 >> 31;
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -190,8 +190,8 @@ hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
        m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
        m128 z1 = eq128(mask1, v);
        m128 z2 = eq128(mask2, v);
-        u32 z = movemask128(and128(or128(lastz1, shiftLeft8Bits(z1)), z2));
-        lastz1 = _mm_srli_si128(z1, 15);
+        u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));
+        lastz1 = z1;
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 128);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,9 @@
#include "shufti.h"
#include "truffle.h"
#include "vermicelli.h"
+#include "multishufti.h"
+#include "multitruffle.h"
+#include "multivermicelli.h"
#include "ue2common.h"

const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
@ -81,6 +84,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
                               c_end - 1);
        break;
case ACCEL_DVERM_MASKED:
DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end);
if (c + 16 + 1 >= c_end) {
return c;
}
/* need to stop one early to get an accurate end state */
rv = vermicelliDoubleMaskedExec(accel->dverm.c1, accel->dverm.c2,
accel->dverm.m1, accel->dverm.m2,
c, c_end - 1);
break;
    case ACCEL_SHUFTI:
        DEBUG_PRINTF("accel shufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
@ -117,6 +132,221 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
        rv = c_end;
        break;
/* multibyte matchers */
case ACCEL_MLVERM:
DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLVERM_NOCASE:
DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM:
DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM_NOCASE:
DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM:
DEBUG_PRINTF("accel msverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM_NOCASE:
DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM:
DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM_NOCASE:
DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MDSVERM:
DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSVERM_NOCASE:
DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM:
DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM_NOCASE:
DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MLSHUFTI:
DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MLGSHUFTI:
DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSSHUFTI:
DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MDSSHUFTI:
DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MDSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MLTRUFFLE:
DEBUG_PRINTF("accel mltruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MLGTRUFFLE:
DEBUG_PRINTF("accel mlgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSTRUFFLE:
DEBUG_PRINTF("accel mstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSGTRUFFLE:
DEBUG_PRINTF("accel msgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MDSTRUFFLE:
DEBUG_PRINTF("accel mdstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
case ACCEL_MDSGTRUFFLE:
DEBUG_PRINTF("accel mdsgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
    default:
        assert(!"not here");
        return c;
@ -127,5 +357,7 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
    rv = MAX(c + accel->generic.offset, rv);
    rv -= accel->generic.offset;
+    DEBUG_PRINTF("advanced %zd\n", rv - c);

    return rv;
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -60,7 +60,37 @@ enum AccelType {
    ACCEL_SHUFTI,
    ACCEL_DSHUFTI,
    ACCEL_TRUFFLE,
-    ACCEL_RED_TAPE
+    ACCEL_RED_TAPE,
/* multibyte vermicellis */
ACCEL_MLVERM,
ACCEL_MLVERM_NOCASE,
ACCEL_MLGVERM,
ACCEL_MLGVERM_NOCASE,
ACCEL_MSVERM,
ACCEL_MSVERM_NOCASE,
ACCEL_MSGVERM,
ACCEL_MSGVERM_NOCASE,
ACCEL_MDSVERM,
ACCEL_MDSVERM_NOCASE,
ACCEL_MDSGVERM,
ACCEL_MDSGVERM_NOCASE,
/* multibyte shuftis */
ACCEL_MLSHUFTI,
ACCEL_MLGSHUFTI,
ACCEL_MSSHUFTI,
ACCEL_MSGSHUFTI,
ACCEL_MDSSHUFTI,
ACCEL_MDSGSHUFTI,
/* multibyte truffles */
ACCEL_MLTRUFFLE,
ACCEL_MLGTRUFFLE,
ACCEL_MSTRUFFLE,
ACCEL_MSGTRUFFLE,
ACCEL_MDSTRUFFLE,
ACCEL_MDSGTRUFFLE,
/* masked dverm */
ACCEL_DVERM_MASKED,
};

/** \brief Structure for accel framework. */
@ -80,7 +110,22 @@ union AccelAux {
        u8 offset;
        u8 c1; // uppercase if nocase
        u8 c2; // uppercase if nocase
u8 m1; // masked variant
u8 m2; // masked variant
    } dverm;
struct {
u8 accel_type;
u8 offset;
u8 c; // uppercase if nocase
u8 len;
} mverm;
struct {
u8 accel_type;
u8 offset;
u8 c; // uppercase if nocase
u8 len1;
u8 len2;
} mdverm;
struct { struct {
u8 accel_type; u8 accel_type;
u8 offset; u8 offset;
@ -95,12 +140,42 @@ union AccelAux {
m128 lo2; m128 lo2;
m128 hi2; m128 hi2;
} dshufti; } dshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len;
} mshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len1;
u8 len2;
} mdshufti;
struct { struct {
u8 accel_type; u8 accel_type;
u8 offset; u8 offset;
m128 mask1; m128 mask1;
m128 mask2; m128 mask2;
} truffle; } truffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len;
} mtruffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len1;
u8 len2;
} mdtruffle;
}; };
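
Every member added to AccelAux here keeps accel_type as its first byte, so the union remains discriminated by that one byte: the runtime reads accel_type and then interprets the rest of the state through the matching member. A small hypothetical accessor, assuming this header is included:

/* Hypothetical helper, not part of the library: read the multibyte long
 * vermicelli length only when accel_type says that member is the live one. */
static unsigned multibyte_long_verm_len(const union AccelAux *aux) {
    if (aux->accel_type == ACCEL_MLVERM || aux->accel_type == ACCEL_MLVERM_NOCASE) {
        return aux->mverm.len;
    }
    return 0;
}
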
/** /**

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -37,16 +37,21 @@
#include "shufticompile.h" #include "shufticompile.h"
#include "trufflecompile.h" #include "trufflecompile.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/bitutils.h"
#include "util/charreach.h" #include "util/charreach.h"
#include "util/dump_charclass.h" #include "util/dump_charclass.h"
#include "util/dump_mask.h" #include "util/dump_mask.h"
#include "util/simd_utils.h"
#include <cstdio> #include <cstdio>
#include <vector>
#ifndef DUMP_SUPPORT #ifndef DUMP_SUPPORT
#error No dump support! #error No dump support!
#endif #endif
using namespace std;
namespace ue2 { namespace ue2 {
static static
@ -62,6 +67,8 @@ const char *accelName(u8 accel_type) {
return "double-vermicelli"; return "double-vermicelli";
case ACCEL_DVERM_NOCASE: case ACCEL_DVERM_NOCASE:
return "double-vermicelli nocase"; return "double-vermicelli nocase";
case ACCEL_DVERM_MASKED:
return "double-vermicelli masked";
case ACCEL_RVERM: case ACCEL_RVERM:
return "reverse vermicelli"; return "reverse vermicelli";
case ACCEL_RVERM_NOCASE: case ACCEL_RVERM_NOCASE:
@ -86,11 +93,144 @@ const char *accelName(u8 accel_type) {
return "truffle"; return "truffle";
case ACCEL_RED_TAPE: case ACCEL_RED_TAPE:
return "red tape"; return "red tape";
case ACCEL_MLVERM:
return "multibyte long vermicelli";
case ACCEL_MLVERM_NOCASE:
return "multibyte long vermicelli nocase";
case ACCEL_MLGVERM:
return "multibyte long-grab vermicelli";
case ACCEL_MLGVERM_NOCASE:
return "multibyte long-grab vermicelli nocase";
case ACCEL_MSVERM:
return "multibyte shift vermicelli";
case ACCEL_MSVERM_NOCASE:
return "multibyte shift vermicelli nocase";
case ACCEL_MSGVERM:
return "multibyte shift-grab vermicelli";
case ACCEL_MSGVERM_NOCASE:
return "multibyte shift-grab vermicelli nocase";
case ACCEL_MDSVERM:
return "multibyte doubleshift vermicelli";
case ACCEL_MDSVERM_NOCASE:
return "multibyte doubleshift vermicelli nocase";
case ACCEL_MDSGVERM:
return "multibyte doubleshift-grab vermicelli";
case ACCEL_MDSGVERM_NOCASE:
return "multibyte doubleshift-grab vermicelli nocase";
case ACCEL_MLSHUFTI:
return "multibyte long shufti";
case ACCEL_MLGSHUFTI:
return "multibyte long-grab shufti";
case ACCEL_MSSHUFTI:
return "multibyte shift shufti";
case ACCEL_MSGSHUFTI:
return "multibyte shift-grab shufti";
case ACCEL_MDSSHUFTI:
return "multibyte doubleshift shufti";
case ACCEL_MDSGSHUFTI:
return "multibyte doubleshift-grab shufti";
case ACCEL_MLTRUFFLE:
return "multibyte long truffle";
case ACCEL_MLGTRUFFLE:
return "multibyte long-grab truffle";
case ACCEL_MSTRUFFLE:
return "multibyte shift truffle";
case ACCEL_MSGTRUFFLE:
return "multibyte shift-grab truffle";
case ACCEL_MDSTRUFFLE:
return "multibyte doubleshift truffle";
case ACCEL_MDSGTRUFFLE:
return "multibyte doubleshift-grab truffle";
default: default:
return "unknown!"; return "unknown!";
} }
} }
static
void dumpShuftiCharReach(FILE *f, const m128 &lo, const m128 &hi) {
CharReach cr = shufti2cr(lo, hi);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
}
static
vector<CharReach> shufti2cr_array(const m128 lo_in, const m128 hi_in) {
const u8 *lo = (const u8 *)&lo_in;
const u8 *hi = (const u8 *)&hi_in;
vector<CharReach> crs(8);
for (u32 i = 0; i < 256; i++) {
u32 combined = lo[(u8)i & 0xf] & hi[(u8)i >> 4];
while (combined) {
u32 j = findAndClearLSB_32(&combined);
crs.at(j).set(i);
}
}
return crs;
}
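
shufti2cr_array inverts the shufti encoding: each of the eight bucket bits corresponds to a character class, and a byte belongs to bucket j when bit j is set in both the low-nibble and high-nibble mask bytes. The forward membership test being undone looks roughly like this (illustrative sketch, not library code):

/* Illustrative: does byte b fall into shufti bucket 'bucket' for these masks? */
static int shufti_bucket_has_byte(const unsigned char lo[16],
                                  const unsigned char hi[16],
                                  unsigned char b, unsigned bucket) {
    unsigned combined = lo[b & 0xf] & hi[b >> 4];
    return (combined >> bucket) & 1;
}
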
static
void dumpDShuftiCharReach(FILE *f, const m128 &lo1, const m128 &hi1,
const m128 &lo2, const m128 &hi2) {
vector<CharReach> cr1 = shufti2cr_array(not128(lo1), not128(hi1));
vector<CharReach> cr2 = shufti2cr_array(not128(lo2), not128(hi2));
map<CharReach, set<u32> > cr1_group;
assert(cr1.size() == 8 && cr2.size() == 8);
for (u32 i = 0; i < 8; i++) {
if (!cr1[i].any()) {
continue;
}
cr1_group[cr1[i]].insert(i);
}
map<CharReach, CharReach> rev;
for (const auto &e : cr1_group) {
CharReach rhs;
for (u32 r : e.second) {
rhs |= cr2.at(r);
}
rev[rhs] |= e.first;
}
fprintf(f, "escapes: {");
for (auto it = rev.begin(); it != rev.end(); ++it) {
const auto &e = *it;
if (it != rev.begin()) {
fprintf(f, ", ");
}
if (e.first.all()) {
fprintf(f, "%s", describeClass(e.second).c_str());
} else {
fprintf(f, "%s%s", describeClass(e.second).c_str(),
describeClass(e.first).c_str());
}
}
fprintf(f, "}\n");
}
static
void dumpShuftiMasks(FILE *f, const m128 &lo, const m128 &hi) {
fprintf(f, "lo %s\n",
dumpMask((const u8 *)&lo, 128).c_str());
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&hi, 128).c_str());
}
static
void dumpTruffleCharReach(FILE *f, const m128 &hiset, const m128 &hiclear) {
CharReach cr = truffle2cr(hiset, hiclear);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
}
static
void dumpTruffleMasks(FILE *f, const m128 &hiset, const m128 &hiclear) {
fprintf(f, "lo %s\n",
dumpMask((const u8 *)&hiset, 128).c_str());
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&hiclear, 128).c_str());
}
void dumpAccelInfo(FILE *f, const AccelAux &accel) { void dumpAccelInfo(FILE *f, const AccelAux &accel) {
fprintf(f, " %s", accelName(accel.accel_type)); fprintf(f, " %s", accelName(accel.accel_type));
if (accel.generic.offset) { if (accel.generic.offset) {
@ -110,39 +250,76 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
case ACCEL_RDVERM_NOCASE: case ACCEL_RDVERM_NOCASE:
fprintf(f, " [\\x%02hhx\\x%02hhx]\n", accel.dverm.c1, accel.dverm.c2); fprintf(f, " [\\x%02hhx\\x%02hhx]\n", accel.dverm.c1, accel.dverm.c2);
break; break;
case ACCEL_DVERM_MASKED:
fprintf(f, " [\\x%02hhx\\x%02hhx] & [\\x%02hhx\\x%02hhx]\n",
accel.dverm.c1, accel.dverm.c2, accel.dverm.m1, accel.dverm.m2);
break;
case ACCEL_SHUFTI: { case ACCEL_SHUFTI: {
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo %s\n", dumpShuftiMasks(f, accel.shufti.lo, accel.shufti.hi);
dumpMask((const u8 *)&accel.shufti.lo, 128).c_str()); dumpShuftiCharReach(f, accel.shufti.lo, accel.shufti.hi);
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&accel.shufti.hi, 128).c_str());
CharReach cr = shufti2cr(accel.shufti.lo, accel.shufti.hi);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
break; break;
} }
case ACCEL_DSHUFTI: case ACCEL_DSHUFTI:
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo1 %s\n", fprintf(f, "mask 1\n");
dumpMask((const u8 *)&accel.dshufti.lo1, 128).c_str()); dumpShuftiMasks(f, accel.dshufti.lo1, accel.dshufti.hi1);
fprintf(f, "hi1 %s\n", fprintf(f, "mask 2\n");
dumpMask((const u8 *)&accel.dshufti.hi1, 128).c_str()); dumpShuftiMasks(f, accel.dshufti.lo2, accel.dshufti.hi2);
fprintf(f, "lo2 %s\n", dumpDShuftiCharReach(f, accel.dshufti.lo1, accel.dshufti.hi1,
dumpMask((const u8 *)&accel.dshufti.lo2, 128).c_str()); accel.dshufti.lo2, accel.dshufti.hi2);
fprintf(f, "hi2 %s\n",
dumpMask((const u8 *)&accel.dshufti.hi2, 128).c_str());
break; break;
case ACCEL_TRUFFLE: { case ACCEL_TRUFFLE: {
fprintf(f, "\n"); fprintf(f, "\n");
fprintf(f, "lo %s\n", dumpTruffleMasks(f, accel.truffle.mask1, accel.truffle.mask2);
dumpMask((const u8 *)&accel.truffle.mask1, 128).c_str()); dumpTruffleCharReach(f, accel.truffle.mask1, accel.truffle.mask2);
fprintf(f, "hi %s\n",
dumpMask((const u8 *)&accel.truffle.mask2, 128).c_str());
CharReach cr = truffle2cr(accel.truffle.mask1, accel.truffle.mask2);
fprintf(f, "count %zu class %s\n", cr.count(),
describeClass(cr).c_str());
break; break;
} }
case ACCEL_MLVERM:
case ACCEL_MLVERM_NOCASE:
case ACCEL_MLGVERM:
case ACCEL_MLGVERM_NOCASE:
case ACCEL_MSVERM:
case ACCEL_MSVERM_NOCASE:
case ACCEL_MSGVERM:
case ACCEL_MSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len);
break;
case ACCEL_MDSVERM:
case ACCEL_MDSVERM_NOCASE:
case ACCEL_MDSGVERM:
case ACCEL_MDSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1,
accel.mdverm.len2);
break;
case ACCEL_MLSHUFTI:
case ACCEL_MLGSHUFTI:
case ACCEL_MSSHUFTI:
case ACCEL_MSGSHUFTI:
fprintf(f, " len:%u\n", accel.mshufti.len);
dumpShuftiMasks(f, accel.mshufti.lo, accel.mshufti.hi);
dumpShuftiCharReach(f, accel.mshufti.lo, accel.mshufti.hi);
break;
case ACCEL_MDSSHUFTI:
case ACCEL_MDSGSHUFTI:
fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2);
dumpShuftiMasks(f, accel.mdshufti.lo, accel.mdshufti.hi);
dumpShuftiCharReach(f, accel.mdshufti.lo, accel.mdshufti.hi);
break;
case ACCEL_MLTRUFFLE:
case ACCEL_MLGTRUFFLE:
case ACCEL_MSTRUFFLE:
case ACCEL_MSGTRUFFLE:
fprintf(f, " len:%u\n", accel.mtruffle.len);
dumpTruffleMasks(f, accel.mtruffle.mask1, accel.mtruffle.mask2);
dumpTruffleCharReach(f, accel.mtruffle.mask1, accel.mtruffle.mask2);
break;
case ACCEL_MDSTRUFFLE:
case ACCEL_MDSGTRUFFLE:
fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2);
dumpTruffleMasks(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2);
dumpTruffleCharReach(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2);
break;
default: default:
fprintf(f, "\n"); fprintf(f, "\n");
break; break;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -94,6 +94,47 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
DEBUG_PRINTF("unable to accelerate case with %zu outs\n", outs); DEBUG_PRINTF("unable to accelerate case with %zu outs\n", outs);
} }
bool buildDvermMask(const flat_set<pair<u8, u8>> &escape_set, u8 *m1_out,
u8 *m2_out) {
u8 a1 = 0xff;
u8 a2 = 0xff;
u8 b1 = 0xff;
u8 b2 = 0xff;
for (const auto &e : escape_set) {
DEBUG_PRINTF("%0hhx %0hhx\n", e.first, e.second);
a1 &= e.first;
b1 &= ~e.first;
a2 &= e.second;
b2 &= ~e.second;
}
u8 m1 = a1 | b1;
u8 m2 = a2 | b2;
u32 holes1 = 8 - popcount32(m1);
u32 holes2 = 8 - popcount32(m2);
DEBUG_PRINTF("aaaa %0hhx %0hhx\n", a1, a2);
DEBUG_PRINTF("bbbb %0hhx %0hhx\n", b1, b2);
DEBUG_PRINTF("mask %0hhx %0hhx\n", m1, m2);
assert(holes1 <= 8 && holes2 <= 8);
assert(escape_set.size() <= 1U << (holes1 + holes2));
if (escape_set.size() != 1U << (holes1 + holes2)) {
return false;
}
if (m1_out) {
*m1_out = m1;
}
if (m2_out) {
*m2_out = m2;
}
return true;
}
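
buildDvermMask tries to express the whole two-byte escape set as one (value, mask) pair per byte position: a1/a2 accumulate the bits set in every escape byte, b1/b2 the bits clear in every escape byte, so m1/m2 end up set exactly where all escapes agree, and the clear "holes" are the bit positions left free. The representation is only exact when the escape set is the full cross product over those free bits, which is what the 2^(holes1 + holes2) size check enforces. A self-contained check of that arithmetic for the classic caseless pair, as a hypothetical plain-C example:

/* Worked check for the escape set {"az", "Az", "aZ", "AZ"}; plain C,
 * independent of the ue2 types used above. */
#include <assert.h>

int main(void) {
    unsigned a1 = 'a' & 'A', b1 = ~'a' & ~'A' & 0xff;
    unsigned a2 = 'z' & 'Z', b2 = ~'z' & ~'Z' & 0xff;
    unsigned m1 = a1 | b1, m2 = a2 | b2;
    assert(m1 == 0xdf && m2 == 0xdf);  /* only the 0x20 case bit is free */
    /* one hole per mask, so 2^(1 + 1) == 4 == |escape set|: a masked
     * double-vermicelli with c1 = 'a' & m1 and c2 = 'z' & m2 is exact. */
    return 0;
}
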
static static
bool isCaselessDouble(const flat_set<pair<u8, u8>> &stop) { bool isCaselessDouble(const flat_set<pair<u8, u8>> &stop) {
// test for vector containing <A,Z> <A,z> <a,Z> <a,z> // test for vector containing <A,Z> <A,z> <a,Z> <a,z>
@ -149,17 +190,31 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
return; return;
} }
if (outs1 + outs2 <= 8) { if (outs1 == 0) {
if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438. u8 m1;
DEBUG_PRINTF("building double-shufti for %zu one-byte and %zu" u8 m2;
" two-byte literals\n", outs1, outs2);
aux->accel_type = ACCEL_DSHUFTI; if (buildDvermMask(info.double_stop2, &m1, &m2)) {
aux->dshufti.offset = offset; aux->accel_type = ACCEL_DVERM_MASKED;
shuftiBuildDoubleMasks(info.double_stop1, info.double_stop2, aux->dverm.offset = offset;
&aux->dshufti.lo1, aux->dverm.c1 = info.double_stop2.begin()->first & m1;
&aux->dshufti.hi1, aux->dverm.c2 = info.double_stop2.begin()->second & m2;
&aux->dshufti.lo2, aux->dverm.m1 = m1;
&aux->dshufti.hi2); aux->dverm.m2 = m2;
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
return;
}
}
if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438.
DEBUG_PRINTF("building double-shufti for %zu one-byte and %zu"
" two-byte literals\n", outs1, outs2);
aux->accel_type = ACCEL_DSHUFTI;
aux->dshufti.offset = offset;
if (shuftiBuildDoubleMasks(info.double_stop1, info.double_stop2,
&aux->dshufti.lo1, &aux->dshufti.hi1,
&aux->dshufti.lo2, &aux->dshufti.hi2)) {
return; return;
} }
} }
@ -169,13 +224,285 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
aux->accel_type = ACCEL_NONE; aux->accel_type = ACCEL_NONE;
} }
static
void buildAccelMulti(const AccelInfo &info, AccelAux *aux) {
if (info.ma_type == MultibyteAccelInfo::MAT_NONE) {
DEBUG_PRINTF("no multimatch for us :(");
return;
}
u32 offset = info.multiaccel_offset;
const CharReach &stops = info.multiaccel_stops;
assert(aux->accel_type == ACCEL_NONE);
if (stops.all()) {
return;
}
size_t outs = stops.count();
DEBUG_PRINTF("%zu outs\n", outs);
assert(outs && outs < 256);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (outs == 1) {
aux->accel_type = ACCEL_MLVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MLGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MSVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MSGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MDSVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSVERM_NOCASE;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MDSGVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSGVERM_NOCASE;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFT:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (shuftiBuildMasks(stops, &aux->mshufti.lo,
&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_DSHIFT:
if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSGSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("shufti build failed, falling through\n");
if (outs <= ACCEL_MAX_STOP_CHAR) {
DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
aux->accel_type = ACCEL_MLTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
aux->accel_type = ACCEL_MLGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFT:
aux->accel_type = ACCEL_MSTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
aux->accel_type = ACCEL_MSGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFT:
aux->accel_type = ACCEL_MDSTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mdtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
aux->accel_type = ACCEL_MDSGTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, &aux->mtruffle.mask1,
&aux->mdtruffle.mask2);
break;
default:
// shouldn't happen
assert(0);
return;
}
return;
}
DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs);
}
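
The long switch above follows the same fallback order as the single- and double-byte builders: prefer a vermicelli form when the stop set is a single character (or a caseless pair), then try shufti masks, and finally fall back to truffle while the stop count stays within ACCEL_MAX_STOP_CHAR. A schematic of that ordering only, as a hypothetical helper; the real logic also dispatches on ma_type and checks stops.isCaselessChar():

static const char *pick_multibyte_family(size_t outs, int shufti_built) {
    if (outs == 1 || outs == 2) {
        return "vermicelli";    /* single stop char, or a caseless pair */
    }
    if (shufti_built) {
        return "shufti";        /* shuftiBuildMasks() did not fail */
    }
    return outs <= ACCEL_MAX_STOP_CHAR ? "truffle" : "none";
}
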
bool buildAccelAux(const AccelInfo &info, AccelAux *aux) { bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
assert(aux->accel_type == ACCEL_NONE); assert(aux->accel_type == ACCEL_NONE);
if (info.single_stops.none()) { if (info.single_stops.none()) {
DEBUG_PRINTF("picked red tape\n"); DEBUG_PRINTF("picked red tape\n");
aux->accel_type = ACCEL_RED_TAPE; aux->accel_type = ACCEL_RED_TAPE;
aux->generic.offset = info.single_offset; aux->generic.offset = info.single_offset;
} else { }
if (aux->accel_type == ACCEL_NONE) {
buildAccelMulti(info, aux);
}
if (aux->accel_type == ACCEL_NONE) {
buildAccelDouble(info, aux); buildAccelDouble(info, aux);
} }
if (aux->accel_type == ACCEL_NONE) { if (aux->accel_type == ACCEL_NONE) {

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -37,9 +37,30 @@ union AccelAux;
namespace ue2 { namespace ue2 {
struct MultibyteAccelInfo {
/* multibyte accel schemes, ordered by strength */
enum multiaccel_type {
MAT_SHIFT,
MAT_SHIFTGRAB,
MAT_DSHIFT,
MAT_DSHIFTGRAB,
MAT_LONG,
MAT_LONGGRAB,
MAT_MAX,
MAT_NONE = MAT_MAX
};
CharReach cr;
u32 offset = 0;
u32 len1 = 0;
u32 len2 = 0;
multiaccel_type type = MAT_NONE;
};
struct AccelInfo { struct AccelInfo {
AccelInfo() : single_offset(0U), double_offset(0U), AccelInfo() : single_offset(0U), double_offset(0U),
single_stops(CharReach::dot()) {} single_stops(CharReach::dot()),
multiaccel_offset(0), ma_len1(0), ma_len2(0),
ma_type(MultibyteAccelInfo::MAT_NONE) {}
u32 single_offset; /**< offset correction to apply to single schemes */ u32 single_offset; /**< offset correction to apply to single schemes */
u32 double_offset; /**< offset correction to apply to double schemes */ u32 double_offset; /**< offset correction to apply to double schemes */
CharReach double_stop1; /**< single-byte accel stop literals for double CharReach double_stop1; /**< single-byte accel stop literals for double
@ -47,10 +68,19 @@ struct AccelInfo {
flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop
* literals */ * literals */
CharReach single_stops; /**< escapes for single byte acceleration */ CharReach single_stops; /**< escapes for single byte acceleration */
u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */
CharReach multiaccel_stops; /**< escapes for multibyte acceleration */
u32 ma_len1; /**< multiaccel len1 */
u32 ma_len2; /**< multiaccel len2 */
MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */
}; };
bool buildAccelAux(const AccelInfo &info, AccelAux *aux); bool buildAccelAux(const AccelInfo &info, AccelAux *aux);
/* returns true if the escape set can be handled with a masked double_verm */
bool buildDvermMask(const flat_set<std::pair<u8, u8>> &escape_set,
u8 *m1_out = nullptr, u8 *m2_out = nullptr);
} // namespace ue2 } // namespace ue2
#endif #endif

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -96,7 +96,8 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q,
repeatHasMatch(info, rctrl, rstate, offset); repeatHasMatch(info, rctrl, rstate, offset);
DEBUG_PRINTF("repeatHasMatch returned %d\n", match); DEBUG_PRINTF("repeatHasMatch returned %d\n", match);
if (match == REPEAT_MATCH) { if (match == REPEAT_MATCH) {
DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, subIdx); DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset,
subIdx, sub->report);
if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) { if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING; return MO_HALT_MATCHING;
} }
@ -111,17 +112,22 @@ int castleReportCurrent(const struct Castle *c, struct mq *q) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
DEBUG_PRINTF("subcastle %u\n", activeIdx); for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
if (activeIdx < c->numRepeats && subCastleReportCurrent(c, q, i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
offset, activeIdx) == MO_HALT_MATCHING) { u8 *cur = active + i * c->activeIdxSize;
return MO_HALT_MATCHING; const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx);
if (subCastleReportCurrent(c, q,
offset, activeIdx) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
@ -162,11 +168,18 @@ static really_inline
char castleInAccept(const struct Castle *c, struct mq *q, char castleInAccept(const struct Castle *c, struct mq *q,
const ReportID report, const u64a offset) { const ReportID report, const u64a offset) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
/* ignore when just catching up due to full queue */
if (report == MO_INVALID_IDX) {
return 0;
}
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
if (activeIdx < c->numRepeats) { for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
if (subCastleInAccept(c, q, report, offset, activeIdx)) { if (subCastleInAccept(c, q, report, offset, activeIdx)) {
return 1; return 1;
@ -174,11 +187,10 @@ char castleInAccept(const struct Castle *c, struct mq *q,
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
if (subCastleInAccept(c, q, report, offset, i)) { if (subCastleInAccept(c, q, report, offset, i)) {
return 1; return 1;
@ -193,7 +205,6 @@ static really_inline
void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset, void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
void *full_state, void *stream_state, void *full_state, void *stream_state,
const u32 subIdx) { const u32 subIdx) {
u8 *active = (u8 *)stream_state;
const struct SubCastle *sub = getSubCastle(c, subIdx); const struct SubCastle *sub = getSubCastle(c, subIdx);
const struct RepeatInfo *info = getRepeatInfo(sub); const struct RepeatInfo *info = getRepeatInfo(sub);
@ -203,10 +214,13 @@ void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) { if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset); DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset);
if (sub->exclusive) { if (sub->exclusiveId < c->numRepeats) {
partial_store_u32(stream_state, c->numRepeats, c->activeIdxSize); u8 *active = (u8 *)stream_state;
u8 *groups = active + c->groupIterOffset;
mmbit_unset(groups, c->numGroups, sub->exclusiveId);
} else { } else {
mmbit_unset(active + c->activeIdxSize, c->numRepeats, subIdx); u8 *active = (u8 *)stream_state + c->activeOffset;
mmbit_unset(active, c->numRepeats, subIdx);
} }
} }
} }
@ -216,30 +230,47 @@ void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
void *full_state, void *stream_state) { void *full_state, void *stream_state) {
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (!c->staleIterOffset) {
DEBUG_PRINTF("{no repeats can go stale}\n");
return; /* no subcastle can ever go stale */
}
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
subCastleDeactivateStaleSubs(c, offset, full_state, subCastleDeactivateStaleSubs(c, offset, full_state,
stream_state, activeIdx); stream_state, activeIdx);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)stream_state + c->activeIdxSize; const u8 *active = (const u8 *)stream_state + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); const struct mmbit_sparse_iter *it
i != MMB_INVALID; = (const void *)((const char *)c + c->staleIterOffset);
i = mmbit_iterate(active, c->numRepeats, i)) {
struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES];
u32 numRepeats = c->numRepeats;
u32 idx = 0;
u32 i = mmbit_sparse_iter_begin(active, numRepeats, &idx, it, si_state);
while (i != MMB_INVALID) {
DEBUG_PRINTF("subcastle %u\n", i); DEBUG_PRINTF("subcastle %u\n", i);
subCastleDeactivateStaleSubs(c, offset, full_state, subCastleDeactivateStaleSubs(c, offset, full_state, stream_state, i);
stream_state, i); i = mmbit_sparse_iter_next(active, numRepeats, i, &idx, it,
si_state);
} }
} }
} }
static really_inline static really_inline
void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset, void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
void *full_state, void *stream_state) { void *full_state, void *stream_state,
UNUSED char stale_checked) {
assert(top < c->numRepeats); assert(top < c->numRepeats);
const struct SubCastle *sub = getSubCastle(c, top); const struct SubCastle *sub = getSubCastle(c, top);
@ -249,12 +280,20 @@ void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
info->packedCtrlSize; info->packedCtrlSize;
char is_alive = 0; char is_alive = 0;
if (sub->exclusive) { u8 *active = (u8 *)stream_state;
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); if (sub->exclusiveId < c->numRepeats) {
is_alive = (activeIdx == top); u8 *groups = active + c->groupIterOffset;
partial_store_u32(stream_state, top, c->activeIdxSize); active += sub->exclusiveId * c->activeIdxSize;
if (mmbit_set(groups, c->numGroups, sub->exclusiveId)) {
const u32 activeIdx = partial_load_u32(active, c->activeIdxSize);
is_alive = (activeIdx == top);
}
if (!is_alive) {
partial_store_u32(active, top, c->activeIdxSize);
}
} else { } else {
u8 *active = (u8 *)stream_state + c->activeIdxSize; active += c->activeOffset;
is_alive = mmbit_set(active, c->numRepeats, top); is_alive = mmbit_set(active, c->numRepeats, top);
} }
@ -263,8 +302,8 @@ void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset,
} else { } else {
DEBUG_PRINTF("repeat %u is already alive\n", top); DEBUG_PRINTF("repeat %u is already alive\n", top);
// Caller should ensure we're not stale. // Caller should ensure we're not stale.
assert(repeatHasMatch(info, rctrl, rstate, offset) != assert(!stale_checked
REPEAT_STALE); || repeatHasMatch(info, rctrl, rstate, offset) != REPEAT_STALE);
// Ignore duplicate top events. // Ignore duplicate top events.
u64a last = repeatLastTop(info, rctrl, rstate); u64a last = repeatLastTop(info, rctrl, rstate);
@ -292,11 +331,11 @@ void subCastleFindMatch(const struct Castle *c, const u64a begin,
u64a match = repeatNextMatch(info, rctrl, rstate, begin); u64a match = repeatNextMatch(info, rctrl, rstate, begin);
if (match == 0) { if (match == 0) {
DEBUG_PRINTF("no more matches for sub %u\n", subIdx); DEBUG_PRINTF("no more matches for sub %u\n", subIdx);
if (sub->exclusive) { if (sub->exclusiveId < c->numRepeats) {
partial_store_u32(stream_state, c->numRepeats, u8 *groups = (u8 *)stream_state + c->groupIterOffset;
c->activeIdxSize); mmbit_unset(groups, c->numGroups, sub->exclusiveId);
} else { } else {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
mmbit_unset(active, c->numRepeats, subIdx); mmbit_unset(active, c->numRepeats, subIdx);
} }
return; return;
@ -329,16 +368,20 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
*mloc = 0; *mloc = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("subcastle %u\n", activeIdx); DEBUG_PRINTF("subcastle %u\n", activeIdx);
subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
&found, activeIdx); &found, activeIdx);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i != MMB_INVALID;
i = mmbit_iterate(active, c->numRepeats, i)) { i = mmbit_iterate(active, c->numRepeats, i)) {
@ -367,31 +410,38 @@ u64a subCastleNextMatch(const struct Castle *c, void *full_state,
return repeatNextMatch(info, rctrl, rstate, loc); return repeatNextMatch(info, rctrl, rstate, loc);
} }
static really_inline
void set_matching(const struct Castle *c, const u64a match, u8 *active,
u8 *matching, const u32 active_size, const u32 active_id,
const u32 matching_id, u64a *offset, const u64a end) {
if (match == 0) {
DEBUG_PRINTF("no more matches\n");
mmbit_unset(active, active_size, active_id);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match == *offset) {
mmbit_set(matching, c->numRepeats, matching_id);
} else if (match < *offset) {
// New minimum offset.
*offset = match;
mmbit_clear(matching, c->numRepeats);
mmbit_set(matching, c->numRepeats, matching_id);
}
}
static really_inline static really_inline
void subCastleMatchLoop(const struct Castle *c, void *full_state, void subCastleMatchLoop(const struct Castle *c, void *full_state,
void *stream_state, const u64a end, void *stream_state, const u64a end,
const u64a loc, u64a *offset) { const u64a loc, u64a *offset) {
u8 *active = (u8 *)stream_state + c->activeIdxSize; u8 *active = (u8 *)stream_state + c->activeOffset;
u8 *matching = full_state; u8 *matching = full_state;
mmbit_clear(matching, c->numRepeats);
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i); u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i);
if (match == 0) { set_matching(c, match, active, matching, c->numRepeats, i,
DEBUG_PRINTF("no more matches\n"); i, offset, end);
mmbit_unset(active, c->numRepeats, i);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match == *offset) {
mmbit_set(matching, c->numRepeats, i);
} else if (match < *offset) {
// New minimum offset.
*offset = match;
mmbit_clear(matching, c->numRepeats);
mmbit_set(matching, c->numRepeats, i);
}
} }
} }
@ -434,61 +484,37 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
// full_state (scratch). // full_state (scratch).
u64a offset = end; // min offset of next match u64a offset = end; // min offset of next match
char found = 0;
u32 activeIdx = 0; u32 activeIdx = 0;
mmbit_clear(matching, c->numRepeats);
if (c->exclusive) { if (c->exclusive) {
activeIdx = partial_load_u32(stream_state, c->activeIdxSize); u8 *active = (u8 *)stream_state;
if (activeIdx < c->numRepeats) { u8 *groups = active + c->groupIterOffset;
u32 i = activeIdx; for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
DEBUG_PRINTF("subcastle %u\n", i); i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
activeIdx = partial_load_u32(cur, c->activeIdxSize);
u64a match = subCastleNextMatch(c, full_state, stream_state, u64a match = subCastleNextMatch(c, full_state, stream_state,
loc, i); loc, activeIdx);
set_matching(c, match, groups, matching, c->numGroups, i,
if (match == 0) { activeIdx, &offset, end);
DEBUG_PRINTF("no more matches\n");
partial_store_u32(stream_state, c->numRepeats,
c->activeIdxSize);
} else if (match > end) {
// If we had a local copy of the active mmbit, we could skip
// looking at this repeat again. But we don't, so we just move
// on.
} else if (match <= offset) {
if (match < offset) {
// New minimum offset.
offset = match;
}
found = 1;
}
} }
} }
const char hasMatch = found; if (c->exclusive != PURE_EXCLUSIVE) {
u64a newOffset = offset;
if (!c->pureExclusive) {
subCastleMatchLoop(c, full_state, stream_state, subCastleMatchLoop(c, full_state, stream_state,
end, loc, &newOffset); end, loc, &offset);
DEBUG_PRINTF("offset=%llu\n", newOffset);
if (mmbit_any(matching, c->numRepeats)) {
found = 1;
if (subCastleFireMatch(c, full_state, stream_state,
cb, ctx, newOffset) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
}
} }
DEBUG_PRINTF("offset=%llu\n", offset);
if (!found) { if (!mmbit_any(matching, c->numRepeats)) {
DEBUG_PRINTF("no more matches\n");
break; break;
} else if (hasMatch && offset == newOffset) {
const struct SubCastle *sub = getSubCastle(c, activeIdx);
DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, activeIdx);
if (cb(offset, sub->report, ctx) == MO_HALT_MATCHING) {
DEBUG_PRINTF("caller told us to halt\n");
return MO_HALT_MATCHING;
}
} }
loc = newOffset;
if (subCastleFireMatch(c, full_state, stream_state,
cb, ctx, offset) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
loc = offset;
} }
return MO_CONTINUE_MATCHING; return MO_CONTINUE_MATCHING;
@ -547,7 +573,8 @@ char castleScanShufti(const struct Castle *c, const u8 *buf, const size_t begin,
static really_inline static really_inline
char castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin, char castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin,
const size_t end, size_t *loc) { const size_t end, size_t *loc) {
const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2, buf + begin, buf + end); const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2,
buf + begin, buf + end);
if (ptr == buf + end) { if (ptr == buf + end) {
DEBUG_PRINTF("no escape found\n"); DEBUG_PRINTF("no escape found\n");
return 0; return 0;
@ -589,7 +616,103 @@ char castleScan(const struct Castle *c, const u8 *buf, const size_t begin,
} }
static really_inline static really_inline
void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp) { char castleRevScanVerm(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
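
castleRevScanVerm and the reverse scanners that follow rely on the convention of the rvermicelliExec family: the return value points at the escape byte that was found, and buf + begin - 1 signals that nothing was found, hence the sentinel comparison ahead of the asserts. A scalar illustration of the same convention (hypothetical helper, single escape byte):

/* Illustrative only: reverse scan with the "begin - 1 means not found"
 * convention used by the castleRevScan* functions. */
static const unsigned char *rev_find_byte(unsigned char esc,
                                          const unsigned char *buf,
                                          size_t begin, size_t end) {
    for (size_t i = end; i > begin; i--) {
        if (buf[i - 1] == esc) {
            return buf + i - 1;     /* last occurrence in the range */
        }
    }
    return buf + begin - 1;         /* sentinel: no escape found */
}
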
static really_inline
char castleRevScanNVerm(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rnvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScanShufti(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const m128 mask_lo = c->u.shuf.mask_lo;
const m128 mask_hi = c->u.shuf.mask_hi;
const u8 *ptr = rshuftiExec(mask_lo, mask_hi, buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScanTruffle(const struct Castle *c, const u8 *buf,
const size_t begin, const size_t end, size_t *loc) {
const u8 *ptr = rtruffleExec(c->u.truffle.mask1, c->u.truffle.mask2,
buf + begin, buf + end);
if (ptr == buf + begin - 1) {
DEBUG_PRINTF("no escape found\n");
return 0;
}
assert(loc);
assert(ptr >= buf && ptr < buf + end);
*loc = (size_t)(ptr - buf);
DEBUG_PRINTF("escape found at offset %zu\n", *loc);
return 1;
}
static really_inline
char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin,
const size_t end, size_t *loc) {
assert(begin <= end);
DEBUG_PRINTF("scanning backwards over (%zu,%zu]\n", begin, end);
if (begin == end) {
return 0;
}
switch (c->type) {
case CASTLE_DOT:
// Nothing can stop a dot scan!
return 0;
case CASTLE_VERM:
return castleRevScanVerm(c, buf, begin, end, loc);
case CASTLE_NVERM:
return castleRevScanNVerm(c, buf, begin, end, loc);
case CASTLE_SHUFTI:
return castleRevScanShufti(c, buf, begin, end, loc);
case CASTLE_TRUFFLE:
return castleRevScanTruffle(c, buf, begin, end, loc);
default:
DEBUG_PRINTF("unknown scan type!\n");
assert(0);
return 0;
}
}
static really_inline
void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp,
char stale_checked) {
const u32 event = q->items[q->cur].type; const u32 event = q->items[q->cur].type;
switch (event) { switch (event) {
case MQE_TOP: case MQE_TOP:
@ -603,11 +726,24 @@ void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp) {
assert(event < MQE_INVALID); assert(event < MQE_INVALID);
u32 top = event - MQE_TOP_FIRST; u32 top = event - MQE_TOP_FIRST;
DEBUG_PRINTF("top %u at offset %llu\n", top, sp); DEBUG_PRINTF("top %u at offset %llu\n", top, sp);
castleProcessTop(c, top, sp, q->state, q->streamState); castleProcessTop(c, top, sp, q->state, q->streamState, stale_checked);
break; break;
} }
} }
static really_inline
void clear_repeats(const struct Castle *c, const struct mq *q, u8 *active) {
DEBUG_PRINTF("clearing active repeats due to escape\n");
if (c->exclusive) {
u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
}
if (c->exclusive != PURE_EXCLUSIVE) {
mmbit_clear(active, c->numRepeats);
}
}
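
clear_repeats and the other runtime entry points all address the castle stream state the same way: each exclusive group has a packed active-subcastle-id slot, the set of live exclusive groups is a multibit at groupIterOffset, and non-exclusive subcastles use a multibit at activeOffset, consulted only when exclusive != PURE_EXCLUSIVE. A compact sketch of that addressing with a stand-in struct; field meanings are taken from castle_internal.h in this change, and the real offsets are laid out by the compile side:

/* Stand-in layout, for illustration only. */
struct castle_layout {
    unsigned numRepeats;      /* total subcastles */
    unsigned numGroups;       /* exclusive groups */
    unsigned activeIdxSize;   /* bytes per packed active-subcastle-id slot */
    unsigned groupIterOffset; /* offset of the exclusive-groups multibit */
    unsigned activeOffset;    /* offset of the non-exclusive multibit */
};

static unsigned char *group_id_slot(const struct castle_layout *c,
                                    unsigned char *state, unsigned group) {
    return state + group * c->activeIdxSize;  /* id of the live subcastle */
}

static unsigned char *group_multibit(const struct castle_layout *c,
                                     unsigned char *state) {
    return state + c->groupIterOffset;        /* which groups are live */
}

static unsigned char *repeat_multibit(const struct castle_layout *c,
                                      unsigned char *state) {
    return state + c->activeOffset;           /* live non-exclusive repeats */
}
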
static really_inline static really_inline
char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end, char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
enum MatchMode mode) { enum MatchMode mode) {
@ -630,7 +766,7 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
return 1; return 1;
} }
u8 *active = (u8 *)q->streamState + c->activeIdxSize; // active multibit u8 *active = (u8 *)q->streamState + c->activeOffset; // active multibit
assert(q->cur + 1 < q->end); // require at least two items assert(q->cur + 1 < q->end); // require at least two items
assert(q_cur_type(q) == MQE_START); assert(q_cur_type(q) == MQE_START);
@ -644,14 +780,8 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
char found = 0; char found = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); found = mmbit_any(groups, c->numGroups);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
} }
if (!found && !mmbit_any(active, c->numRepeats)) { if (!found && !mmbit_any(active, c->numRepeats)) {
@ -698,15 +828,7 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
} }
if (escape_found) { if (escape_found) {
DEBUG_PRINTF("clearing active repeats due to escape\n"); clear_repeats(c, q, active);
if (c->exclusive) {
partial_store_u32(q->streamState, c->numRepeats,
c->activeIdxSize);
}
if (!c->pureExclusive) {
mmbit_clear(active, c->numRepeats);
}
} }
} }
@ -720,15 +842,14 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end,
} }
sp = q_cur_offset(q); sp = q_cur_offset(q);
castleHandleEvent(c, q, sp); castleHandleEvent(c, q, sp, 1);
q->cur++; q->cur++;
} }
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); if (mmbit_any_precise(groups, c->numGroups)) {
if (c->pureExclusive || activeIdx < c->numRepeats) { return 1;
return activeIdx < c->numRepeats;
} }
} }
@ -745,28 +866,34 @@ char nfaExecCastle0_Q2(const struct NFA *n, struct mq *q, s64a end) {
return nfaExecCastle0_Q_i(n, q, end, STOP_AT_MATCH); return nfaExecCastle0_Q_i(n, q, end, STOP_AT_MATCH);
} }
static really_inline static
void castleStreamSilent(const struct Castle *c, u8 *active, const u8 *buf, s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
size_t length) { assert(q_cur_type(q) == MQE_START);
DEBUG_PRINTF("entry\n"); assert(q_last_type(q) == MQE_END);
s64a sp = q_cur_loc(q);
s64a ep = q_last_loc(q);
// This call doesn't produce matches, so we elide the castleMatchLoop call DEBUG_PRINTF("finding final squash in (%lld, %lld]\n", sp, ep);
// entirely and just do escape scans to maintain the repeat.
size_t eloc = 0; size_t loc;
char escaped = castleScan(c, buf, 0, length, &eloc);
if (escaped) { if (ep > 0) {
assert(eloc < length); if (castleRevScan(c, q->buffer, sp > 0 ? sp : 0, ep, &loc)) {
DEBUG_PRINTF("escape found at %zu, clearing castle\n", eloc); return (s64a)loc;
if (c->exclusive) {
partial_store_u32(active - c->activeIdxSize,
c->numRepeats, c->activeIdxSize);
}
if (!c->pureExclusive) {
mmbit_clear(active, c->numRepeats);
} }
ep = 0;
} }
if (sp < 0) {
s64a hlen = q->hlength;
if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
return (s64a)loc - hlen;
}
ep = 0;
}
return sp - 1; /* the repeats are never killed */
} }
char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) {
@ -780,85 +907,44 @@ char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
assert(q_cur_type(q) == MQE_START); assert(q_cur_type(q) == MQE_START);
u64a sp = q_cur_offset(q);
q->cur++;
DEBUG_PRINTF("sp=%llu\n", sp);
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
u8 *active = (u8 *)q->streamState + c->activeIdxSize; u8 *active = (u8 *)q->streamState + c->activeOffset;
char found = 0;
u64a end_offset = q_last_loc(q) + q->offset;
s64a last_kill_loc = castleLastKillLoc(c, q);
DEBUG_PRINTF("all repeats killed at %lld (exec range %lld, %lld)\n",
last_kill_loc, q_cur_loc(q), q_last_loc(q));
assert(last_kill_loc < q_last_loc(q));
if (last_kill_loc != q_cur_loc(q) - 1) {
clear_repeats(c, q, active);
}
q->cur++; /* skip start event */
/* skip events prior to the repeats being squashed */
while (q_cur_loc(q) <= last_kill_loc) {
DEBUG_PRINTF("skipping moot event at %lld\n", q_cur_loc(q));
q->cur++;
assert(q->cur < q->end);
}
while (q->cur < q->end) { while (q->cur < q->end) {
DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q),
q_cur_offset(q)); q_cur_offset(q));
found = 0; u64a sp = q_cur_offset(q);
if (c->exclusive) { castleHandleEvent(c, q, sp, 0);
const u32 activeIdx = partial_load_u32(q->streamState,
c->activeIdxSize);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
}
if (!found && !mmbit_any(active, c->numRepeats)) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
u64a ep = q_cur_offset(q);
if (sp < q->offset) {
DEBUG_PRINTF("HISTORY BUFFER SCAN\n");
assert(q->offset - sp <= q->hlength);
u64a local_ep = MIN(q->offset, ep);
const u8 *ptr = q->history + q->hlength + sp - q->offset;
castleStreamSilent(c, active, ptr, local_ep - sp);
sp = local_ep;
}
found = 0;
if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState,
c->activeIdxSize);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
}
if (!found && !mmbit_any(active, c->numRepeats)) {
DEBUG_PRINTF("castle is dead\n");
goto scan_done;
}
if (sp < ep) {
DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - q->offset <= q->length);
const u8 *ptr = q->buffer + sp - q->offset;
castleStreamSilent(c, active, ptr, ep - sp);
}
scan_done:
sp = q_cur_offset(q);
castleDeactivateStaleSubs(c, sp, q->state, q->streamState);
castleHandleEvent(c, q, sp);
q->cur++; q->cur++;
} }
found = 0; castleDeactivateStaleSubs(c, end_offset, q->state, q->streamState);
char found = 0;
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
c->activeIdxSize); found = mmbit_any_precise(groups, c->numGroups);
if (activeIdx < c->numRepeats) {
found = 1;
} else if (c->pureExclusive) {
DEBUG_PRINTF("castle is dead\n");
return 0;
}
} }
if (!found && !mmbit_any_precise(active, c->numRepeats)) { if (!found && !mmbit_any_precise(active, c->numRepeats)) {
@ -866,7 +952,7 @@ scan_done:
return 0; return 0;
} }
if (castleInAccept(c, q, report, sp)) { if (castleInAccept(c, q, report, end_offset)) {
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
@ -901,11 +987,12 @@ char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) {
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
assert(q->streamState); assert(q->streamState);
if (c->exclusive) { if (c->exclusive) {
partial_store_u32(q->streamState, c->numRepeats, c->activeIdxSize); u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)q->streamState + c->activeIdxSize; u8 *active = (u8 *)q->streamState + c->activeOffset;
mmbit_clear(active, c->numRepeats); mmbit_clear(active, c->numRepeats);
} }
return 0; return 0;
@ -919,11 +1006,12 @@ char nfaExecCastle0_initCompressedState(const struct NFA *n, UNUSED u64a offset,
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
if (c->exclusive) { if (c->exclusive) {
partial_store_u32(state, c->numRepeats, c->activeIdxSize); u8 *groups = (u8 *)state + c->groupIterOffset;
mmbit_clear(groups, c->numGroups);
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
u8 *active = (u8 *)state + c->activeIdxSize; u8 *active = (u8 *)state + c->activeOffset;
mmbit_clear(active, c->numRepeats); mmbit_clear(active, c->numRepeats);
} }
return 0; return 0;
@ -954,16 +1042,19 @@ char nfaExecCastle0_queueCompressState(const struct NFA *n, const struct mq *q,
const u64a offset = q->offset + loc; const u64a offset = q->offset + loc;
DEBUG_PRINTF("offset=%llu\n", offset); DEBUG_PRINTF("offset=%llu\n", offset);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(q->streamState, u8 *active = (u8 *)q->streamState;
c->activeIdxSize); u8 *groups = active + c->groupIterOffset;
if (activeIdx < c->numRepeats) { for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
DEBUG_PRINTF("packing state for sub %u\n", activeIdx); DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
subCastleQueueCompressState(c, activeIdx, q, offset); subCastleQueueCompressState(c, activeIdx, q, offset);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
const u8 *active = (const u8 *)q->streamState + c->activeIdxSize; const u8 *active = (const u8 *)q->streamState + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
DEBUG_PRINTF("packing state for sub %u\n", i); DEBUG_PRINTF("packing state for sub %u\n", i);
@ -997,15 +1088,19 @@ char nfaExecCastle0_expandState(const struct NFA *n, void *dest,
const struct Castle *c = getImplNfa(n); const struct Castle *c = getImplNfa(n);
if (c->exclusive) { if (c->exclusive) {
const u32 activeIdx = partial_load_u32(src, c->activeIdxSize); const u8 *active = (const u8 *)src;
if (activeIdx < c->numRepeats) { const u8 *groups = active + c->groupIterOffset;
for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
const u8 *cur = active + i * c->activeIdxSize;
const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
subCastleExpandState(c, activeIdx, dest, src, offset); subCastleExpandState(c, activeIdx, dest, src, offset);
} }
} }
if (!c->pureExclusive) { if (c->exclusive != PURE_EXCLUSIVE) {
// Unpack state for all active repeats. // Unpack state for all active repeats.
const u8 *active = (const u8 *)src + c->activeIdxSize; const u8 *active = (const u8 *)src + c->activeOffset;
for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) {
subCastleExpandState(c, i, dest, src, offset); subCastleExpandState(c, i, dest, src, offset);
@ -1013,4 +1108,3 @@ char nfaExecCastle0_expandState(const struct NFA *n, void *dest,
} }
return 0; return 0;
} }

View File

@ -100,6 +100,7 @@ void nfaExecCastle0_dumpText(const struct NFA *nfa, FILE *f) {
fprintf(f, "unknown type %u\n", c->type); fprintf(f, "unknown type %u\n", c->type);
break; break;
} }
fprintf(f, "Stale Iter Offset: %u\n", c->staleIterOffset);
fprintf(f, "\n"); fprintf(f, "\n");
dumpTextReverse(nfa, f); dumpTextReverse(nfa, f);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,9 @@ struct SubCastle {
u32 streamStateOffset; //!< offset within stream state u32 streamStateOffset; //!< offset within stream state
u32 repeatInfoOffset; //!< offset of RepeatInfo structure u32 repeatInfoOffset; //!< offset of RepeatInfo structure
// relative to the start of SubCastle // relative to the start of SubCastle
char exclusive; //!< exclusive info of this SubCastle u32 exclusiveId; //!< exclusive group id of this SubCastle,
// set to the number of SubCastles in Castle
// if it is not exclusive
}; };
#define CASTLE_DOT 0 #define CASTLE_DOT 0
@ -51,6 +53,12 @@ struct SubCastle {
#define CASTLE_SHUFTI 3 #define CASTLE_SHUFTI 3
#define CASTLE_TRUFFLE 4 #define CASTLE_TRUFFLE 4
enum ExclusiveType {
NOT_EXCLUSIVE, //!< no subcastles are exclusive
EXCLUSIVE, //!< a subset of subcastles are exclusive
PURE_EXCLUSIVE //!< all subcastles are exclusive
};
/** /**
* \brief Castle engine structure. * \brief Castle engine structure.
* *
@ -63,26 +71,60 @@ struct SubCastle {
* - struct Castle * - struct Castle
* - struct SubCastle[numRepeats] * - struct SubCastle[numRepeats]
* - tables for sparse model repeats * - tables for sparse model repeats
* - sparse iterator for subcastles that may be stale
* *
* Castle stores an "active repeats" multibit in stream state, followed by the * Castle stores an "active repeats" multibit in stream state, followed by the
* packed repeat state for each SubCastle. If all SubCastles are mutual * packed repeat state for each SubCastle. If there are both exclusive and
* exclusive, we store current active SubCastle id instead of "active repeats" * non-exclusive SubCastle groups, we use an active id for each exclusive group
* multibit in stream state. If there are both exclusive and non-exclusive * and a multibit for the non-exclusive group. We also store an "active
* SubCastle groups, we use an active id for the exclusive group and a multibit * exclusive groups" multibit for exclusive groups. If all SubCastles are mutual
* for the non-exclusive group. * exclusive, we remove "active repeats" multibit from stream state.
* * Castle stream state:
* *
* * |---|
* * | | active subengine id for exclusive group 1
* * |---|
* * | | active subengine id for exclusive group 2(if necessary)
* * |---|
* * ...
* * |---|
* * | | "active repeats" multibit for non-exclusive subcastles
* * | | (if not all subcastles are exclusive)
* * |---|
* * | | active multibit for exclusive groups
* * | |
* * |---|
* * ||-|| common pool of stream state for exclusive group 1
* * ||-||
* * |---|
* * ||-|| common pool of stream state for exclusive group 2(if necessary)
* * ||-||
* * |---|
* * ...
* * |---|
* * | | stream state for each non-exclusive subcastles
* * ...
* * | |
* * |---|
* *
* In full state (stored in scratch space) it stores a temporary multibit over * In full state (stored in scratch space) it stores a temporary multibit over
* the repeats (used by \ref castleMatchLoop), followed by the repeat control * the repeats (used by \ref castleMatchLoop), followed by the repeat control
* blocks for each SubCastle. If all SubCastles are mutual exclusive, we only * blocks for each SubCastle.
* need to store the repeat control blocks for each SubCastle.
*/ */
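A small sketch may help tie the layout above to the offsets the runtime uses; the names here are illustrative, not Hyperscan's API. It mirrors how the compress/expand code addresses stream state: packed active ids at the base, the non-exclusive multibit at activeOffset, and the exclusive-group multibit at groupIterOffset.

    #include <stdint.h>

    /* Illustrative sketch: locate the three leading regions of Castle
     * stream state described in the layout above. */
    struct castle_stream_view {
        const uint8_t *group_ids;    /* packed active id per exclusive group */
        const uint8_t *nonexcl_mmb;  /* "active repeats" multibit (non-exclusive) */
        const uint8_t *group_mmb;    /* aliveness multibit for exclusive groups */
    };

    static struct castle_stream_view
    castle_stream_view_of(const uint8_t *stream, uint32_t activeOffset,
                          uint32_t groupIterOffset) {
        struct castle_stream_view v;
        v.group_ids   = stream;                   /* ids sit at the base */
        v.nonexcl_mmb = stream + activeOffset;
        v.group_mmb   = stream + groupIterOffset;
        return v;
    }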
struct ALIGN_AVX_DIRECTIVE Castle { struct ALIGN_AVX_DIRECTIVE Castle {
u32 numRepeats; u32 numRepeats; //!< number of repeats in Castle
u8 type; //!< tells us which scanning mechanism (below) to use u32 numGroups; //!< number of exclusive groups
char exclusive; //!< tells us if there are mutual exclusive SubCastles u8 type; //!< tells us which scanning mechanism (below) to use
char pureExclusive; //!< tells us if all SubCastles are mutual exclusive u8 exclusive; //!< tells us if there are mutual exclusive SubCastles
u8 activeIdxSize; //!< number of bytes in stream state to store u8 activeIdxSize; //!< number of bytes in stream state to store
// active SubCastle id for exclusive mode // active SubCastle id for exclusive mode
u32 activeOffset; //!< offset to active multibit for non-exclusive
// SubCastles
u32 staleIterOffset; //!< offset to a sparse iterator to check for stale
// sub castles
u32 groupIterOffset; //!< offset to a iterator to check the aliveness of
// exclusive groups
union { union {
struct { struct {
char c; char c;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -32,6 +32,7 @@
#include "castlecompile.h" #include "castlecompile.h"
#include "castle_internal.h" #include "castle_internal.h"
#include "limex_limits.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "repeatcompile.h" #include "repeatcompile.h"
#include "shufticompile.h" #include "shufticompile.h"
@ -47,7 +48,9 @@
#include "util/dump_charclass.h" #include "util/dump_charclass.h"
#include "util/graph.h" #include "util/graph.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/multibit_build.h"
#include "util/multibit_internal.h" #include "util/multibit_internal.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/verify_types.h" #include "util/verify_types.h"
#include "grey.h" #include "grey.h"
@ -63,7 +66,6 @@ using boost::adaptors::map_values;
namespace ue2 { namespace ue2 {
#define CASTLE_MAX_TOPS 32
#define CLIQUE_GRAPH_MAX_SIZE 1000 #define CLIQUE_GRAPH_MAX_SIZE 1000
static static
@ -204,7 +206,7 @@ bool graph_empty(const Graph &g) {
static static
vector<u32> removeClique(CliqueGraph &cg) { vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1); vector<vector<u32>> cliquesVec(1);
DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); DEBUG_PRINTF("graph size:%zu\n", num_vertices(cg));
findCliqueGroup(cg, cliquesVec[0]); findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) { while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back(); const vector<u32> &c = cliquesVec.back();
@ -236,7 +238,7 @@ vector<u32> removeClique(CliqueGraph &cg) {
} }
} }
DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size()); DEBUG_PRINTF("clique size:%zu\n", cliquesVec[id].size());
return cliquesVec[id]; return cliquesVec[id];
} }
@ -244,17 +246,18 @@ vector<u32> removeClique(CliqueGraph &cg) {
// the end locations where it overlaps with other literals, // the end locations where it overlaps with other literals,
// then the literals are mutual exclusive // then the literals are mutual exclusive
static static
bool findExclusivePair(const u32 id1, const u32 id2, bool findExclusivePair(const size_t id1, const size_t id2,
const size_t lower,
const vector<vector<size_t>> &min_reset_dist, const vector<vector<size_t>> &min_reset_dist,
const vector<vector<vector<CharReach>>> &triggers) { const vector<vector<vector<CharReach>>> &triggers) {
const auto &triggers1 = triggers[id1]; const auto &triggers1 = triggers[id1];
const auto &triggers2 = triggers[id2]; const auto &triggers2 = triggers[id2];
for (u32 i = 0; i < triggers1.size(); ++i) { for (size_t i = 0; i < triggers1.size(); ++i) {
for (u32 j = 0; j < triggers2.size(); ++j) { for (size_t j = 0; j < triggers2.size(); ++j) {
if (!literalOverlap(triggers1[i], triggers2[j], if (!literalOverlap(triggers1[i], triggers2[j],
min_reset_dist[id2][j]) || min_reset_dist[id2 - lower][j]) ||
!literalOverlap(triggers2[j], triggers1[i], !literalOverlap(triggers2[j], triggers1[i],
min_reset_dist[id1][i])) { min_reset_dist[id1 - lower][i])) {
return false; return false;
} }
} }
@ -263,40 +266,75 @@ bool findExclusivePair(const u32 id1, const u32 id2,
} }
static static
vector<u32> checkExclusion(const CharReach &cr, vector<vector<u32>> checkExclusion(u32 &streamStateSize,
const vector<vector<vector<CharReach>>> &triggers) { const CharReach &cr,
vector<u32> group; const vector<vector<vector<CharReach>>> &triggers,
if (!triggers.size() || triggers.size() == 1) { enum ExclusiveType &exclusive,
return group; const size_t numRepeats) {
} vector<vector<u32>> groups;
size_t trigSize = triggers.size();
DEBUG_PRINTF("trigSize %zu\n", trigSize);
vector<vector<size_t>> min_reset_dist; size_t lower = 0;
// get min reset distance for each repeat size_t total = 0;
for (auto it = triggers.begin(); it != triggers.end(); it++) { while (lower < trigSize) {
const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr); vector<CliqueVertex> vertices;
min_reset_dist.push_back(tmp_dist); unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
}
vector<CliqueVertex> vertices; vector<vector<size_t>> min_reset_dist;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>(); size_t upper = min(lower + CLIQUE_GRAPH_MAX_SIZE, trigSize);
for (u32 i = 0; i < triggers.size(); ++i) { // get min reset distance for each repeat
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); for (size_t i = lower; i < upper; i++) {
vertices.push_back(v); CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
} vertices.push_back(v);
// find exclusive pair for each repeat const vector<size_t> &tmp_dist =
for (u32 i = 0; i < triggers.size(); ++i) { minResetDistToEnd(triggers[i], cr);
CliqueVertex s = vertices[i]; min_reset_dist.push_back(tmp_dist);
for (u32 j = i + 1; j < triggers.size(); ++j) { }
if (findExclusivePair(i, j, min_reset_dist, triggers)) {
CliqueVertex d = vertices[j]; // find exclusive pair for each repeat
add_edge(s, d, *cg); for (size_t i = lower; i < upper; i++) {
CliqueVertex s = vertices[i - lower];
for (size_t j = i + 1; j < upper; j++) {
if (findExclusivePair(i, j, lower, min_reset_dist,
triggers)) {
CliqueVertex d = vertices[j - lower];
add_edge(s, d, *cg);
}
} }
} }
}
// find the largest exclusive group // find the largest exclusive group
return removeClique(*cg); auto clique = removeClique(*cg);
size_t cliqueSize = clique.size();
if (cliqueSize > 1) {
groups.push_back(clique);
exclusive = EXCLUSIVE;
total += cliqueSize;
}
lower += CLIQUE_GRAPH_MAX_SIZE;
}
DEBUG_PRINTF("clique size %zu, num of repeats %zu\n",
total, numRepeats);
if (total == numRepeats) {
exclusive = PURE_EXCLUSIVE;
streamStateSize = 0;
};
return groups;
}
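One detail worth noting in the chunked clique check above: candidate repeats are processed in windows of at most CLIQUE_GRAPH_MAX_SIZE, and per-window tables such as min_reset_dist are indexed relative to the window base, which is why findExclusivePair() subtracts lower. A minimal sketch of that windowing follows; WINDOW stands in for CLIQUE_GRAPH_MAX_SIZE (1000 above) and the callback name is hypothetical.

    #include <stddef.h>

    #define WINDOW 1000

    /* Illustrative only: per-window data is indexed by (id - lower). */
    static void for_each_window(size_t total,
                                void (*fn)(size_t lower, size_t upper)) {
        for (size_t lower = 0; lower < total; lower += WINDOW) {
            size_t upper = lower + WINDOW < total ? lower + WINDOW : total;
            fn(lower, upper);
        }
    }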
namespace {
struct ExclusiveInfo {
/** Mapping between top and exclusive group id */
map<u32, u32> groupId;
/** Number of exclusive groups */
u32 numGroups = 0;
};
} }
static static
@ -305,10 +343,15 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
const vector<pair<depth, bool>> &repeatInfoPair, const vector<pair<depth, bool>> &repeatInfoPair,
u32 &scratchStateSize, u32 &streamStateSize, u32 &scratchStateSize, u32 &streamStateSize,
u32 &tableSize, vector<u64a> &tables, u32 &sparseRepeats, u32 &tableSize, vector<u64a> &tables, u32 &sparseRepeats,
const set<u32> &exclusiveGroup) { const ExclusiveInfo &exclusiveInfo,
vector<u32> &may_stale, const ReportManager &rm) {
const bool remap_reports = has_managed_reports(proto.kind);
u32 i = 0; u32 i = 0;
u32 maxStreamSize = 0; const auto &groupId = exclusiveInfo.groupId;
bool exclusive = exclusiveGroup.size() > 1; const auto &numGroups = exclusiveInfo.numGroups;
vector<u32> maxStreamSize(numGroups, 0);
for (auto it = proto.repeats.begin(), ite = proto.repeats.end(); for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) { it != ite; ++it, ++i) {
const PureRepeat &pr = it->second; const PureRepeat &pr = it->second;
@ -316,33 +359,35 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
bool is_reset = repeatInfoPair[i].second; bool is_reset = repeatInfoPair[i].second;
enum RepeatType rtype = chooseRepeatType(pr.bounds.min, pr.bounds.max, enum RepeatType rtype = chooseRepeatType(pr.bounds.min, pr.bounds.max,
min_period, is_reset); min_period, is_reset, true);
RepeatStateInfo rsi(rtype, pr.bounds.min, pr.bounds.max, min_period); RepeatStateInfo rsi(rtype, pr.bounds.min, pr.bounds.max, min_period);
DEBUG_PRINTF("sub %u: selected %s model for %s repeat\n", i, DEBUG_PRINTF("sub %u: selected %s model for %s repeat\n", i,
repeatTypeName(rtype), pr.bounds.str().c_str()); repeatTypeName(rtype), pr.bounds.str().c_str());
u32 subScratchStateSize;
u32 subStreamStateSize;
SubCastle &sub = subs[i]; SubCastle &sub = subs[i];
RepeatInfo &info = infos[i]; RepeatInfo &info = infos[i];
// handle exclusive case differently info.packedCtrlSize = rsi.packedCtrlSize;
if (exclusive && exclusiveGroup.find(i) != exclusiveGroup.end()) { u32 subStreamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
maxStreamSize = MAX(maxStreamSize, rsi.packedCtrlSize);
} else {
subScratchStateSize = verify_u32(sizeof(RepeatControl));
subStreamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
info.packedCtrlSize = rsi.packedCtrlSize; // Handle stream/scratch space alloc for exclusive case differently.
if (contains(groupId, i)) {
u32 id = groupId.at(i);
maxStreamSize[id] = max(maxStreamSize[id], subStreamStateSize);
// SubCastle full/stream state offsets are written in for the group
// below.
} else {
sub.fullStateOffset = scratchStateSize; sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize; sub.streamStateOffset = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
scratchStateSize += subScratchStateSize;
streamStateSize += subStreamStateSize; streamStateSize += subStreamStateSize;
} }
if (pr.bounds.max.is_finite()) {
may_stale.push_back(i);
}
info.type = verify_u8(rtype); info.type = verify_u8(rtype);
info.repeatMin = depth_to_u32(pr.bounds.min); info.repeatMin = depth_to_u32(pr.bounds.min);
info.repeatMax = depth_to_u32(pr.bounds.max); info.repeatMax = depth_to_u32(pr.bounds.max);
@ -358,35 +403,44 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
info.encodingSize = rsi.encodingSize; info.encodingSize = rsi.encodingSize;
info.patchesOffset = rsi.patchesOffset; info.patchesOffset = rsi.patchesOffset;
sub.report = *pr.reports.begin(); assert(pr.reports.size() == 1);
ReportID id = *pr.reports.begin();
sub.report = remap_reports ? rm.getProgramOffset(id) : id;
if (rtype == REPEAT_SPARSE_OPTIMAL_P) { if (rtype == REPEAT_SPARSE_OPTIMAL_P) {
for (u32 j = 0; j < rsi.patchSize; j++) { for (u32 j = 0; j < rsi.patchSize; j++) {
tables.push_back(rsi.table[j]); tables.push_back(rsi.table[j]);
} }
sparseRepeats++; sparseRepeats++;
patchSize[i] = rsi.patchSize; patchSize[i] = rsi.patchSize;
tableSize += rsi.patchSize; tableSize += rsi.patchSize;
} }
} }
if (exclusive) { vector<u32> scratchOffset(numGroups, 0);
for (auto k : exclusiveGroup) { vector<u32> streamOffset(numGroups, 0);
SubCastle &sub = subs[k]; for (const auto &j : groupId) {
RepeatInfo &info = infos[k]; u32 top = j.first;
info.packedCtrlSize = maxStreamSize; u32 id = j.second;
SubCastle &sub = subs[top];
if (!scratchOffset[id]) {
sub.fullStateOffset = scratchStateSize; sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize; sub.streamStateOffset = streamStateSize;
scratchOffset[id] = scratchStateSize;
streamOffset[id] = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += maxStreamSize[id];
} else {
sub.fullStateOffset = scratchOffset[id];
sub.streamStateOffset = streamOffset[id];
} }
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += maxStreamSize;
} }
} }
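The per-group bookkeeping in buildSubcastles() above encodes a simple space optimisation: members of one exclusive group can never be live at the same time, so they share a single stream-state slot sized for the group's largest member (maxStreamSize[id]) and a single RepeatControl block in scratch. A sketch of that sizing rule, with assumed parameter names:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only: compute one shared stream-state slot per exclusive
     * group, sized for the group's largest member. */
    static uint32_t shared_group_bytes(const uint32_t *member_size,
                                       const uint32_t *group_of, size_t n,
                                       uint32_t *max_per_group,
                                       uint32_t groups) {
        uint32_t total = 0;
        for (uint32_t g = 0; g < groups; g++) {
            max_per_group[g] = 0;
        }
        for (size_t i = 0; i < n; i++) {
            uint32_t g = group_of[i];
            if (member_size[i] > max_per_group[g]) {
                max_per_group[g] = member_size[i];
            }
        }
        for (uint32_t g = 0; g < groups; g++) {
            total += max_per_group[g]; /* one slot per exclusive group */
        }
        return total;
    }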
aligned_unique_ptr<NFA> aligned_unique_ptr<NFA>
buildCastle(const CastleProto &proto, buildCastle(const CastleProto &proto,
const map<u32, vector<vector<CharReach>>> &triggers, const map<u32, vector<vector<CharReach>>> &triggers,
const CompileContext &cc) { const CompileContext &cc, const ReportManager &rm) {
assert(cc.grey.allowCastle); assert(cc.grey.allowCastle);
const size_t numRepeats = proto.repeats.size(); const size_t numRepeats = proto.repeats.size();
@ -418,8 +472,9 @@ buildCastle(const CastleProto &proto,
depth maxWidth(0); depth maxWidth(0);
u32 i = 0; u32 i = 0;
vector<u32> candidateRepeats; ExclusiveInfo exclusiveInfo;
vector<vector<vector<CharReach>>> candidateTriggers; vector<vector<vector<CharReach>>> candidateTriggers;
vector<u32> candidateRepeats;
vector<pair<depth, bool>> repeatInfoPair; vector<pair<depth, bool>> repeatInfoPair;
for (auto it = proto.repeats.begin(), ite = proto.repeats.end(); for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) { it != ite; ++it, ++i) {
@ -454,49 +509,60 @@ buildCastle(const CastleProto &proto,
repeatInfoPair.push_back(make_pair(min_period, is_reset)); repeatInfoPair.push_back(make_pair(min_period, is_reset));
if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) { candidateTriggers.push_back(triggers.at(top));
candidateTriggers.push_back(triggers.at(top)); candidateRepeats.push_back(i);
candidateRepeats.push_back(i);
}
} }
// Case 1: exclusive repeats // Case 1: exclusive repeats
bool exclusive = false; enum ExclusiveType exclusive = NOT_EXCLUSIVE;
bool pureExclusive = false;
u32 activeIdxSize = 0; u32 activeIdxSize = 0;
set<u32> exclusiveGroup; u32 groupIterOffset = 0;
if (cc.grey.castleExclusive) { if (cc.grey.castleExclusive) {
vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers); auto cliqueGroups =
const u32 exclusiveSize = tmpGroup.size(); checkExclusion(streamStateSize, cr, candidateTriggers,
if (exclusiveSize > 1) { exclusive, numRepeats);
// Case 1: mutual exclusive repeats group found, initialize state for (const auto &group : cliqueGroups) {
// sizes // mutual exclusive repeats group found,
exclusive = true; // update state sizes
activeIdxSize = calcPackedBytes(numRepeats + 1); activeIdxSize = calcPackedBytes(numRepeats + 1);
if (exclusiveSize == numRepeats) {
pureExclusive = true;
streamStateSize = 0;
scratchStateSize = 0;
}
streamStateSize += activeIdxSize; streamStateSize += activeIdxSize;
// replace with top values // replace with top values
for (const auto &val : tmpGroup) { for (const auto &val : group) {
exclusiveGroup.insert(candidateRepeats[val]); const u32 top = candidateRepeats[val];
exclusiveInfo.groupId[top] = exclusiveInfo.numGroups;
} }
exclusiveInfo.numGroups++;
} }
if (exclusive) {
groupIterOffset = streamStateSize;
streamStateSize += mmbit_size(exclusiveInfo.numGroups);
}
DEBUG_PRINTF("num of groups:%u\n", exclusiveInfo.numGroups);
} }
candidateRepeats.clear();
DEBUG_PRINTF("reach %s exclusive %u\n", describeClass(cr).c_str(), DEBUG_PRINTF("reach %s exclusive %u\n", describeClass(cr).c_str(),
exclusive); exclusive);
u32 tableSize = 0; u32 tableSize = 0;
u32 sparseRepeats = 0; u32 sparseRepeats = 0;
vector<u32> may_stale; /* sub castles that may go stale */
buildSubcastles(proto, subs, infos, patchSize, repeatInfoPair, buildSubcastles(proto, subs, infos, patchSize, repeatInfoPair,
scratchStateSize, streamStateSize, tableSize, scratchStateSize, streamStateSize, tableSize,
tables, sparseRepeats, exclusiveGroup); tables, sparseRepeats, exclusiveInfo, may_stale, rm);
const size_t total_size = DEBUG_PRINTF("%zu subcastles may go stale\n", may_stale.size());
vector<mmbit_sparse_iter> stale_iter;
if (!may_stale.empty()) {
mmbBuildSparseIterator(stale_iter, may_stale, numRepeats);
}
size_t total_size =
sizeof(NFA) + // initial NFA structure sizeof(NFA) + // initial NFA structure
sizeof(Castle) + // Castle structure sizeof(Castle) + // Castle structure
sizeof(SubCastle) * subs.size() + // SubCastles themselves sizeof(SubCastle) * subs.size() + // SubCastles themselves
@ -506,6 +572,9 @@ buildCastle(const CastleProto &proto,
sizeof(u64a) * sparseRepeats; // paddings for sizeof(u64a) * sparseRepeats; // paddings for
// REPEAT_SPARSE_OPTIMAL_P tables // REPEAT_SPARSE_OPTIMAL_P tables
total_size = ROUNDUP_N(total_size, alignof(mmbit_sparse_iter));
total_size += byte_length(stale_iter); // stale sparse iter
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size); aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
nfa->type = verify_u8(CASTLE_NFA_0); nfa->type = verify_u8(CASTLE_NFA_0);
nfa->length = verify_u32(total_size); nfa->length = verify_u32(total_size);
@ -515,12 +584,15 @@ buildCastle(const CastleProto &proto,
nfa->minWidth = verify_u32(minWidth); nfa->minWidth = verify_u32(minWidth);
nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0; nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;
char *ptr = (char *)nfa.get() + sizeof(NFA); char * const base_ptr = (char *)nfa.get() + sizeof(NFA);
char *ptr = base_ptr;
Castle *c = (Castle *)ptr; Castle *c = (Castle *)ptr;
c->numRepeats = verify_u32(subs.size()); c->numRepeats = verify_u32(subs.size());
c->exclusive = exclusive; c->numGroups = exclusiveInfo.numGroups;
c->pureExclusive = pureExclusive; c->exclusive = verify_s8(exclusive);
c->activeIdxSize = verify_u8(activeIdxSize); c->activeIdxSize = verify_u8(activeIdxSize);
c->activeOffset = verify_u32(c->numGroups * activeIdxSize);
c->groupIterOffset = groupIterOffset;
writeCastleScanEngine(cr, c); writeCastleScanEngine(cr, c);
@ -554,12 +626,22 @@ buildCastle(const CastleProto &proto,
} }
// set exclusive group info // set exclusive group info
if (exclusiveGroup.find(i) != exclusiveGroup.end()) { if (contains(exclusiveInfo.groupId, i)) {
sub->exclusive = 1; sub->exclusiveId = exclusiveInfo.groupId[i];
} else { } else {
sub->exclusive = 0; sub->exclusiveId = numRepeats;
} }
} }
ptr = base_ptr + total_size - sizeof(NFA) - byte_length(stale_iter);
assert(ptr + byte_length(stale_iter) == base_ptr + total_size - sizeof(NFA));
if (!stale_iter.empty()) {
c->staleIterOffset = verify_u32(ptr - base_ptr);
copy_bytes(ptr, stale_iter);
ptr += byte_length(stale_iter);
}
return nfa; return nfa;
} }
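The stale-iterator placement at the end of buildCastle() relies on rounding the running engine size up to the iterator's alignment before appending it (the ROUNDUP_N call above). For reference, a minimal sketch of that rounding, assuming a power-of-two alignment:

    #include <stddef.h>
    #include <assert.h>

    /* Illustrative only: round n up to the next multiple of align, where
     * align must be a power of two, as ROUNDUP_N is used above. */
    static size_t roundup_pow2(size_t n, size_t align) {
        assert((align & (align - 1)) == 0);
        return (n + align - 1) & ~(align - 1);
    }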
@ -603,7 +685,7 @@ depth findMaxWidth(const CastleProto &proto, u32 top) {
return proto.repeats.at(top).bounds.max; return proto.repeats.at(top).bounds.max;
} }
CastleProto::CastleProto(const PureRepeat &pr) { CastleProto::CastleProto(nfa_kind k, const PureRepeat &pr) : kind(k) {
assert(pr.reach.any()); assert(pr.reach.any());
assert(pr.reports.size() == 1); assert(pr.reports.size() == 1);
u32 top = 0; u32 top = 0;
@ -665,6 +747,7 @@ u32 CastleProto::merge(const PureRepeat &pr) {
bool mergeCastle(CastleProto &c1, const CastleProto &c2, bool mergeCastle(CastleProto &c1, const CastleProto &c2,
map<u32, u32> &top_map) { map<u32, u32> &top_map) {
assert(&c1 != &c2); assert(&c1 != &c2);
assert(c1.kind == c2.kind);
DEBUG_PRINTF("c1 has %zu repeats, c2 has %zu repeats\n", c1.repeats.size(), DEBUG_PRINTF("c1 has %zu repeats, c2 has %zu repeats\n", c1.repeats.size(),
c2.repeats.size()); c2.repeats.size());
@ -738,6 +821,7 @@ bool is_equal(const CastleProto &c1, ReportID report1, const CastleProto &c2,
ReportID report2) { ReportID report2) {
assert(!c1.repeats.empty()); assert(!c1.repeats.empty());
assert(!c2.repeats.empty()); assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) { if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n"); DEBUG_PRINTF("different reach\n");
@ -784,6 +868,7 @@ bool is_equal(const CastleProto &c1, ReportID report1, const CastleProto &c2,
bool is_equal(const CastleProto &c1, const CastleProto &c2) { bool is_equal(const CastleProto &c1, const CastleProto &c2) {
assert(!c1.repeats.empty()); assert(!c1.repeats.empty());
assert(!c2.repeats.empty()); assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) { if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n"); DEBUG_PRINTF("different reach\n");
@ -877,7 +962,7 @@ bool hasZeroMinBound(const CastleProto &proto) {
return false; return false;
} }
unique_ptr<NGHolder> makeHolder(const CastleProto &proto, nfa_kind kind, unique_ptr<NGHolder> makeHolder(const CastleProto &proto,
const CompileContext &cc) { const CompileContext &cc) {
assert(!proto.repeats.empty()); assert(!proto.repeats.empty());
@ -890,10 +975,10 @@ unique_ptr<NGHolder> makeHolder(const CastleProto &proto, nfa_kind kind,
} }
} }
unique_ptr<NGHolder> g = ue2::make_unique<NGHolder>(kind); auto g = ue2::make_unique<NGHolder>(proto.kind);
for (const auto &m : proto.repeats) { for (const auto &m : proto.repeats) {
if (m.first >= CASTLE_MAX_TOPS) { if (m.first >= NFA_MAX_TOP_MASKS) {
DEBUG_PRINTF("top %u too big for an NFA\n", m.first); DEBUG_PRINTF("top %u too big for an NFA\n", m.first);
return nullptr; return nullptr;
} }


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -51,6 +51,7 @@ namespace ue2 {
class CharReach; class CharReach;
class NGHolder; class NGHolder;
class ReportManager;
struct CompileContext; struct CompileContext;
/** /**
@ -65,7 +66,7 @@ struct CompileContext;
*/ */
struct CastleProto { struct CastleProto {
static constexpr size_t max_occupancy = 65536; // arbitrary limit static constexpr size_t max_occupancy = 65536; // arbitrary limit
explicit CastleProto(const PureRepeat &pr); CastleProto(nfa_kind k, const PureRepeat &pr);
const CharReach &reach() const; const CharReach &reach() const;
/** \brief Add a new repeat. */ /** \brief Add a new repeat. */
@ -94,6 +95,9 @@ struct CastleProto {
* so we track this explicitly instead of using repeats.size(). * so we track this explicitly instead of using repeats.size().
*/ */
u32 next_top = 1; u32 next_top = 1;
/** \brief Kind for this engine. */
nfa_kind kind;
}; };
std::set<ReportID> all_reports(const CastleProto &proto); std::set<ReportID> all_reports(const CastleProto &proto);
@ -119,7 +123,7 @@ void remapCastleTops(CastleProto &proto, std::map<u32, u32> &top_map);
ue2::aligned_unique_ptr<NFA> ue2::aligned_unique_ptr<NFA>
buildCastle(const CastleProto &proto, buildCastle(const CastleProto &proto,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers, const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
const CompileContext &cc); const CompileContext &cc, const ReportManager &rm);
/** /**
* \brief Merge two CastleProto prototypes together, if possible. * \brief Merge two CastleProto prototypes together, if possible.
@ -155,7 +159,7 @@ bool requiresDedupe(const CastleProto &proto,
/** /**
* \brief Build an NGHolder from a CastleProto. * \brief Build an NGHolder from a CastleProto.
*/ */
std::unique_ptr<NGHolder> makeHolder(const CastleProto &castle, nfa_kind kind, std::unique_ptr<NGHolder> makeHolder(const CastleProto &castle,
const CompileContext &cc); const CompileContext &cc);
} // namespace ue2 } // namespace ue2


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -1049,15 +1049,16 @@ char nfaExecGough16_inAccept(const struct NFA *n, ReportID report,
} }
static static
void goughCheckEOD(const struct NFA *nfa, u16 s, char goughCheckEOD(const struct NFA *nfa, u16 s,
const struct gough_som_info *som, const struct gough_som_info *som,
u64a offset, SomNfaCallback cb, void *ctxt) { u64a offset, SomNfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (aux->accept_eod) { if (!aux->accept_eod) {
doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL); return MO_CONTINUE_MATCHING;
} }
return doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL);
} }
char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state,
@ -1065,8 +1066,8 @@ char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state,
UNUSED NfaCallback callback, UNUSED NfaCallback callback,
SomNfaCallback som_callback, void *context) { SomNfaCallback som_callback, void *context) {
const struct gough_som_info *som = getSomInfoConst(state); const struct gough_som_info *som = getSomInfoConst(state);
goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback, context); return goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback,
return 0; context);
} }
char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state,
@ -1075,8 +1076,8 @@ char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state,
SomNfaCallback som_callback, void *context) { SomNfaCallback som_callback, void *context) {
assert(ISALIGNED_N(state, 8)); assert(ISALIGNED_N(state, 8));
const struct gough_som_info *som = getSomInfoConst(state); const struct gough_som_info *som = getSomInfoConst(state);
goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback, context); return goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback,
return 0; context);
} }
char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -41,8 +41,9 @@
#include "util/graph_range.h" #include "util/graph_range.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/order_check.h" #include "util/order_check.h"
#include "util/verify_types.h" #include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/verify_types.h"
#include "ue2common.h" #include "ue2common.h"
@ -77,18 +78,20 @@ namespace {
class gough_build_strat : public mcclellan_build_strat { class gough_build_strat : public mcclellan_build_strat {
public: public:
gough_build_strat(raw_som_dfa &r, const GoughGraph &g, gough_build_strat(
const map<dstate_id_t, gough_accel_state_info> &accel_info) raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm,
: mcclellan_build_strat(r), rdfa(r), gg(g), const map<dstate_id_t, gough_accel_state_info> &accel_info)
: mcclellan_build_strat(r, rm), rdfa(r), gg(g),
accel_gough_info(accel_info) {} accel_gough_info(accel_info) {}
unique_ptr<raw_report_info> gatherReports(vector<u32> &reports /* out */, unique_ptr<raw_report_info> gatherReports(vector<u32> &reports /* out */,
vector<u32> &reports_eod /* out */, vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const override; ReportID *arbReport /* out */) const override;
void find_escape_strings(dstate_id_t this_idx, AccelScheme find_escape_strings(dstate_id_t this_idx) const override;
escape_info *out) const override;
size_t accelSize(void) const override { return sizeof(gough_accel); } size_t accelSize(void) const override { return sizeof(gough_accel); }
void buildAccel(dstate_id_t this_idx, void *accel_out) override; void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) override;
u32 max_allowed_offset_accel() const override { return 0; }
raw_som_dfa &rdfa; raw_som_dfa &rdfa;
const GoughGraph &gg; const GoughGraph &gg;
@ -1034,7 +1037,8 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
} }
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision, aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc) { const CompileContext &cc,
const ReportManager &rm) {
assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8 assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8
|| !cc.streaming); || !cc.streaming);
@ -1066,7 +1070,7 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
map<dstate_id_t, gough_accel_state_info> accel_allowed; map<dstate_id_t, gough_accel_state_info> accel_allowed;
find_allowed_accel_states(*cfg, blocks, &accel_allowed); find_allowed_accel_states(*cfg, blocks, &accel_allowed);
gough_build_strat gbs(raw, *cfg, accel_allowed); gough_build_strat gbs(raw, *cfg, rm, accel_allowed);
aligned_unique_ptr<NFA> basic_dfa = mcclellanCompile_i(raw, gbs, cc); aligned_unique_ptr<NFA> basic_dfa = mcclellanCompile_i(raw, gbs, cc);
assert(basic_dfa); assert(basic_dfa);
if (!basic_dfa) { if (!basic_dfa) {
@ -1145,32 +1149,44 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
return gough_dfa; return gough_dfa;
} }
void gough_build_strat::find_escape_strings(dstate_id_t this_idx, AccelScheme gough_build_strat::find_escape_strings(dstate_id_t this_idx) const {
escape_info *out) const { AccelScheme rv;
if (!contains(accel_gough_info, this_idx)) { if (!contains(accel_gough_info, this_idx)) {
out->outs = CharReach::dot(); rv.cr = CharReach::dot();
out->outs2_broken = true; rv.double_byte.clear();
return; return rv;
} }
mcclellan_build_strat::find_escape_strings(this_idx, out); rv = mcclellan_build_strat::find_escape_strings(this_idx);
if (!accel_gough_info.at(this_idx).two_byte) { assert(!rv.offset || rv.cr.all()); /* should have been limited by strat */
out->outs2_broken = true; if (rv.offset) {
rv.cr = CharReach::dot();
rv.double_byte.clear();
return rv;
} }
if (rv.double_offset
|| !accel_gough_info.at(this_idx).two_byte) {
rv.double_byte.clear();
}
return rv;
} }
void gough_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) { void gough_build_strat::buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) {
assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux)); assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux));
gough_accel *accel = (gough_accel *)accel_out; gough_accel *accel = (gough_accel *)accel_out;
/* build a plain accelaux so we can work out where we can get to */ /* build a plain accelaux so we can work out where we can get to */
mcclellan_build_strat::buildAccel(this_idx, &accel->accel); mcclellan_build_strat::buildAccel(this_idx, info, &accel->accel);
DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx, DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx,
accel->accel.accel_type); accel->accel.accel_type);
if (accel->accel.accel_type == ACCEL_NONE) { if (accel->accel.accel_type == ACCEL_NONE) {
return; return;
} }
assert(!accel->accel.generic.offset);
assert(contains(accel_gough_info, this_idx)); assert(contains(accel_gough_info, this_idx));
accel->margin_dist = verify_u8(accel_gough_info.at(this_idx).margin); accel->margin_dist = verify_u8(accel_gough_info.at(this_idx).margin);
built_accel[accel] = this_idx; built_accel[accel] = this_idx;
@ -1182,10 +1198,11 @@ namespace {
struct raw_gough_report_list { struct raw_gough_report_list {
set<som_report> reports; set<som_report> reports;
explicit raw_gough_report_list( raw_gough_report_list(
const vector<pair<ReportID, GoughSSAVar *>> &raw_reports) { const vector<pair<ReportID, GoughSSAVar *>> &raw_reports,
const ReportManager &rm, bool do_remap) {
for (const auto &m : raw_reports) { for (const auto &m : raw_reports) {
ReportID r = m.first; ReportID r = do_remap ? rm.getProgramOffset(m.first) : m.first;
u32 impl_slot = INVALID_SLOT; u32 impl_slot = INVALID_SLOT;
if (m.second) { if (m.second) {
impl_slot = m.second->slot; impl_slot = m.second->slot;
@ -1214,11 +1231,13 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
vector<u32> &reports_eod, vector<u32> &reports_eod,
u8 *isSingleReport, u8 *isSingleReport,
ReportID *arbReport) const { ReportID *arbReport) const {
unique_ptr<raw_gough_report_info_impl> ri =
ue2::make_unique<raw_gough_report_info_impl>();
map<raw_gough_report_list, u32> rev;
DEBUG_PRINTF("gathering reports\n"); DEBUG_PRINTF("gathering reports\n");
const bool remap_reports = has_managed_reports(rdfa.kind);
auto ri = ue2::make_unique<raw_gough_report_info_impl>();
map<raw_gough_report_list, u32> rev;
assert(!rdfa.states.empty()); assert(!rdfa.states.empty());
vector<GoughVertex> verts(rdfa.states.size()); vector<GoughVertex> verts(rdfa.states.size());
@ -1237,7 +1256,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
continue; continue;
} }
raw_gough_report_list rrl(gg[v].reports); raw_gough_report_list rrl(gg[v].reports, rm, remap_reports);
DEBUG_PRINTF("non empty r %zu\n", reports.size()); DEBUG_PRINTF("non empty r %zu\n", reports.size());
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports.push_back(rev[rrl]); reports.push_back(rev[rrl]);
@ -1256,7 +1275,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
} }
DEBUG_PRINTF("non empty r eod\n"); DEBUG_PRINTF("non empty r eod\n");
raw_gough_report_list rrl(gg[v].reports_eod); raw_gough_report_list rrl(gg[v].reports_eod, rm, remap_reports);
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports_eod.push_back(rev[rrl]); reports_eod.push_back(rev[rrl]);
continue; continue;


@ -89,7 +89,8 @@ struct raw_som_dfa : public raw_dfa {
}; };
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision, aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc); const CompileContext &cc,
const ReportManager &rm);
} // namespace ue2 } // namespace ue2


@ -130,6 +130,9 @@ char repeatIsDead(const struct RepeatInfo *info,
return lstate->ctrl.ring.offset == REPEAT_DEAD; return lstate->ctrl.ring.offset == REPEAT_DEAD;
case REPEAT_TRAILER: case REPEAT_TRAILER:
return lstate->ctrl.trailer.offset == REPEAT_DEAD; return lstate->ctrl.trailer.offset == REPEAT_DEAD;
case REPEAT_ALWAYS:
assert(!"REPEAT_ALWAYS should only be used by Castle");
return 0;
} }
assert(0); assert(0);


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -65,14 +65,13 @@ extern "C"
void *state, u8 key); \ void *state, u8 key); \
char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \ char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \
size_t buflen, const u8 *hbuf, size_t hlen, \ size_t buflen, const u8 *hbuf, size_t hlen, \
struct hs_scratch *scratch, NfaCallback cb, \ NfaCallback cb, void *context); \
void *context); \
char gf_name##_queueCompressState(const struct NFA *nfa, \ char gf_name##_queueCompressState(const struct NFA *nfa, \
const struct mq *q, s64a loc); \ const struct mq *q, s64a loc); \
char gf_name##_expandState(const struct NFA *nfa, void *dest, \ char gf_name##_expandState(const struct NFA *nfa, void *dest, \
const void *src, u64a offset, u8 key); \ const void *src, u64a offset, u8 key); \
enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa, \ enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa, \
struct mq *q, s64a loc); \ struct mq *q, s64a loc); \
GENERATE_NFA_DUMP_DECL(gf_name) GENERATE_NFA_DUMP_DECL(gf_name)
GENERATE_NFA_DECL(nfaExecLimEx32_1) GENERATE_NFA_DECL(nfaExecLimEx32_1)


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,6 +38,9 @@
#include "nfa_internal.h" #include "nfa_internal.h"
#include "shufti.h" #include "shufti.h"
#include "truffle.h" #include "truffle.h"
#include "multishufti.h"
#include "multitruffle.h"
#include "multivermicelli.h"
#include "ue2common.h" #include "ue2common.h"
#include "vermicelli.h" #include "vermicelli.h"
#include "util/bitutils.h" #include "util/bitutils.h"
@ -46,74 +49,6 @@
#include "util/simd_utils_ssse3.h" #include "util/simd_utils_ssse3.h"
#include "util/shuffle_ssse3.h" #include "util/shuffle_ssse3.h"
static
const u8 *accelScan(const union AccelAux *aux, const u8 *ptr, const u8 *end) {
assert(ISALIGNED(aux)); // must be SIMD aligned for shufti
assert(end > ptr);
assert(end - ptr >= 16); // must be at least 16 bytes to scan
const u8 *start = ptr;
u8 offset;
switch (aux->accel_type) {
case ACCEL_VERM:
DEBUG_PRINTF("single vermicelli for 0x%02hhx\n", aux->verm.c);
offset = aux->verm.offset;
ptr = vermicelliExec(aux->verm.c, 0, ptr, end);
break;
case ACCEL_VERM_NOCASE:
DEBUG_PRINTF("single vermicelli-nocase for 0x%02hhx\n", aux->verm.c);
offset = aux->verm.offset;
ptr = vermicelliExec(aux->verm.c, 1, ptr, end);
break;
case ACCEL_DVERM:
DEBUG_PRINTF("double vermicelli for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
offset = aux->dverm.offset;
ptr = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 0, ptr, end);
break;
case ACCEL_DVERM_NOCASE:
DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
offset = aux->dverm.offset;
ptr = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2,
1, ptr, end);
break;
case ACCEL_SHUFTI:
DEBUG_PRINTF("single shufti\n");
offset = aux->shufti.offset;
ptr = shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
break;
case ACCEL_DSHUFTI:
DEBUG_PRINTF("double shufti\n");
offset = aux->dshufti.offset;
ptr = shuftiDoubleExec(aux->dshufti.lo1, aux->dshufti.hi1,
aux->dshufti.lo2, aux->dshufti.hi2, ptr, end);
break;
case ACCEL_TRUFFLE:
DEBUG_PRINTF("truffle shuffle\n");
offset = aux->truffle.offset;
ptr = truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
break;
case ACCEL_RED_TAPE:
ptr = end; /* there is no escape */
offset = aux->generic.offset;
break;
default:
/* no acceleration, fall through and return current ptr */
offset = 0;
break;
}
if (offset) {
ptr -= offset;
if (ptr < start) {
return start;
}
}
return ptr;
}
static really_inline static really_inline
size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux,
const u8 *input, u32 idx, size_t i, size_t end) { const u8 *input, u32 idx, size_t i, size_t end) {
@ -134,7 +69,7 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux,
} }
aux = aux + aux_idx; aux = aux + aux_idx;
const u8 *ptr = accelScan(aux, &input[i], &input[end]); const u8 *ptr = run_accel(aux, &input[i], &input[end]);
assert(ptr >= &input[i]); assert(ptr >= &input[i]);
size_t j = (size_t)(ptr - input); size_t j = (size_t)(ptr - input);
DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i)); DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i));


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -80,9 +80,11 @@ struct precalcAccel {
CharReach double_cr; CharReach double_cr;
flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */ flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
u32 double_offset; u32 double_offset;
MultibyteAccelInfo ma_info;
}; };
struct meteor_accel_info { struct limex_accel_info {
ue2::unordered_set<NFAVertex> accelerable; ue2::unordered_set<NFAVertex> accelerable;
map<NFAStateSet, precalcAccel> precalc; map<NFAStateSet, precalcAccel> precalc;
ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends; ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends;
@ -162,7 +164,7 @@ struct build_info {
bool stateCompression; bool stateCompression;
const CompileContext &cc; const CompileContext &cc;
u32 num_states; u32 num_states;
meteor_accel_info accel; limex_accel_info accel;
}; };
// Constants for scoring mechanism // Constants for scoring mechanism
@ -334,12 +336,16 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
} }
struct AccelBuild { struct AccelBuild {
AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0) {} AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0), ma_len1(0),
ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
NFAVertex v; NFAVertex v;
u32 state; u32 state;
u32 offset; // offset correction to apply u32 offset; // offset correction to apply
CharReach stop1; // single-byte accel stop literals CharReach stop1; // single-byte accel stop literals
flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
u32 ma_len1; // multiaccel len1
u32 ma_len2; // multiaccel len2
MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type
}; };
static static
@ -354,7 +360,12 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
build.stop1 = CharReach::dot(); build.stop1 = CharReach::dot();
} else { } else {
const precalcAccel &precalc = bi.accel.precalc.at(ss); const precalcAccel &precalc = bi.accel.precalc.at(ss);
if (precalc.double_lits.empty()) { unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2;
if (ma_len >= MULTIACCEL_MIN_LEN) {
build.ma_len1 = precalc.ma_info.len1;
build.stop1 = precalc.ma_info.cr;
build.offset = precalc.ma_info.offset;
} else if (precalc.double_lits.empty()) {
build.stop1 = precalc.single_cr; build.stop1 = precalc.single_cr;
build.offset = precalc.single_offset; build.offset = precalc.single_offset;
} else { } else {
@ -534,7 +545,7 @@ void filterAccelStates(NGHolder &g, const map<u32, NFAVertex> &tops,
} }
static static
bool containsBadSubset(const meteor_accel_info &accel, bool containsBadSubset(const limex_accel_info &accel,
const NFAStateSet &state_set, const u32 effective_sds) { const NFAStateSet &state_set, const u32 effective_sds) {
NFAStateSet subset(state_set.size()); NFAStateSet subset(state_set.size());
for (size_t j = state_set.find_first(); j != state_set.npos; for (size_t j = state_set.find_first(); j != state_set.npos;
@ -555,11 +566,29 @@ bool containsBadSubset(const meteor_accel_info &accel,
} }
static static
void doAccelCommon(NGHolder &g, bool is_too_wide(const AccelScheme &as) {
ue2::unordered_map<NFAVertex, AccelScheme> &accel_map, return as.cr.count() > MAX_MERGED_ACCEL_STOPS;
const ue2::unordered_map<NFAVertex, u32> &state_ids, }
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
const u32 num_states, meteor_accel_info *accel) { static
void fillAccelInfo(build_info &bi) {
if (!bi.do_accel) {
return;
}
NGHolder &g = bi.h;
limex_accel_info &accel = bi.accel;
unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
const CompileContext &cc = bi.cc;
const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
const u32 num_states = bi.num_states;
nfaFindAccelSchemes(g, br_cyclic, &accel_map);
filterAccelStates(g, bi.tops, &accel_map);
assert(accel_map.size() <= NFA_MAX_ACCEL_STATES);
vector<CharReach> refined_cr = reduced_cr(g, br_cyclic); vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);
vector<NFAVertex> astates; vector<NFAVertex> astates;
@ -590,7 +619,7 @@ void doAccelCommon(NGHolder &g,
} }
} }
if (containsBadSubset(*accel, state_set, effective_sds)) { if (containsBadSubset(accel, state_set, effective_sds)) {
DEBUG_PRINTF("accel %u has bad subset\n", i); DEBUG_PRINTF("accel %u has bad subset\n", i);
continue; /* if a subset failed to build we would too */ continue; /* if a subset failed to build we would too */
} }
@ -598,30 +627,37 @@ void doAccelCommon(NGHolder &g,
const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy); const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy);
AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic, AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic,
allow_wide); allow_wide, true);
if (as.cr.count() > MAX_MERGED_ACCEL_STOPS) { if (is_too_wide(as)) {
DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i, DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i,
as.cr.count(), MAX_MERGED_ACCEL_STOPS); as.cr.count(), MAX_MERGED_ACCEL_STOPS);
continue; continue;
} }
DEBUG_PRINTF("accel %u ok with offset %u\n", i, as.offset); DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
as.double_offset);
precalcAccel &pa = accel->precalc[state_set]; // try multibyte acceleration first
pa.single_offset = as.offset; MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc);
pa.single_cr = as.cr;
precalcAccel &pa = accel.precalc[state_set];
useful |= state_set; useful |= state_set;
if (states.size() == 1) { // if we successfully built a multibyte accel scheme, use that
DoubleAccelInfo b = findBestDoubleAccelInfo(g, states.front()); if (mai.type != MultibyteAccelInfo::MAT_NONE) {
if (pa.single_cr.count() > b.stop1.count()) { pa.ma_info = mai;
/* insert this information into the precalc accel info as it is
* better than the single scheme */ DEBUG_PRINTF("multibyte acceleration!\n");
pa.double_offset = b.offset; continue;
pa.double_lits = b.stop2;
pa.double_cr = b.stop1;
}
} }
pa.single_offset = as.offset;
pa.single_cr = as.cr;
if (as.double_byte.size() != 0) {
pa.double_offset = as.double_offset;
pa.double_lits = as.double_byte;
pa.double_cr = as.double_cr;
};
} }
for (const auto &m : accel_map) { for (const auto &m : accel_map) {
@ -638,31 +674,22 @@ void doAccelCommon(NGHolder &g,
state_set.reset(); state_set.reset();
state_set.set(state_id); state_set.set(state_id);
auto p_it = accel->precalc.find(state_set); bool is_multi = false;
if (p_it != accel->precalc.end()) { auto p_it = accel.precalc.find(state_set);
if (p_it != accel.precalc.end()) {
const precalcAccel &pa = p_it->second; const precalcAccel &pa = p_it->second;
offset = max(pa.double_offset, pa.single_offset); offset = max(pa.double_offset, pa.single_offset);
is_multi = pa.ma_info.type != MultibyteAccelInfo::MAT_NONE;
assert(offset <= MAX_ACCEL_DEPTH); assert(offset <= MAX_ACCEL_DEPTH);
} }
accel->accelerable.insert(v); accel.accelerable.insert(v);
findAccelFriends(g, v, br_cyclic, offset, &accel->friends[v]); if (!is_multi) {
findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
}
} }
} }
static
void fillAccelInfo(build_info &bi) {
if (!bi.do_accel) {
return;
}
nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);
filterAccelStates(bi.h, bi.tops, &bi.accel.accel_map);
assert(bi.accel.accel_map.size() <= NFA_MAX_ACCEL_STATES);
doAccelCommon(bi.h, bi.accel.accel_map, bi.state_ids, bi.br_cyclic,
bi.num_states, &bi.accel);
}
/** The AccelAux structure has large alignment specified, and this makes some /** The AccelAux structure has large alignment specified, and this makes some
* compilers do odd things unless we specify a custom allocator. */ * compilers do odd things unless we specify a custom allocator. */
typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)> > typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)> >
@ -672,7 +699,7 @@ static
void buildAccel(const build_info &args, NFAStateSet &accelMask, void buildAccel(const build_info &args, NFAStateSet &accelMask,
NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec, NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec,
vector<u8> &accelTable) { vector<u8> &accelTable) {
const meteor_accel_info &accel = args.accel; const limex_accel_info &accel = args.accel;
// Init, all zeroes. // Init, all zeroes.
accelMask.resize(args.num_states); accelMask.resize(args.num_states);
@ -737,8 +764,16 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
if (contains(accel.precalc, states)) { if (contains(accel.precalc, states)) {
const precalcAccel &precalc = accel.precalc.at(states); const precalcAccel &precalc = accel.precalc.at(states);
ainfo.single_offset = precalc.single_offset; if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) {
ainfo.single_stops = precalc.single_cr; ainfo.ma_len1 = precalc.ma_info.len1;
ainfo.ma_len2 = precalc.ma_info.len2;
ainfo.multiaccel_offset = precalc.ma_info.offset;
ainfo.multiaccel_stops = precalc.ma_info.cr;
ainfo.ma_type = precalc.ma_info.type;
} else {
ainfo.single_offset = precalc.single_offset;
ainfo.single_stops = precalc.single_cr;
}
} }
buildAccelAux(ainfo, &aux); buildAccelAux(ainfo, &aux);
@ -2152,7 +2187,7 @@ u32 countAccelStates(NGHolder &h,
if (!cc.grey.allowLimExNFA) { if (!cc.grey.allowLimExNFA) {
DEBUG_PRINTF("limex not allowed\n"); DEBUG_PRINTF("limex not allowed\n");
return NFA_MAX_ACCEL_STATES + 1; return 0;
} }
// Sanity check the input data. // Sanity check the input data.
@ -2166,11 +2201,11 @@ u32 countAccelStates(NGHolder &h,
do_accel, state_compression, cc, num_states); do_accel, state_compression, cc, num_states);
// Acceleration analysis. // Acceleration analysis.
fillAccelInfo(bi); nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);
u32 num_accel = verify_u32(bi.accel.accelerable.size()); u32 num_accel = verify_u32(bi.accel.accel_map.size());
DEBUG_PRINTF("found %u accel states\n", num_accel); DEBUG_PRINTF("found %u accel states\n", num_accel);
return min(num_accel, (u32)NFA_MAX_ACCEL_STATES); return num_accel;
} }
} // namespace ue2 } // namespace ue2


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -79,11 +79,10 @@ aligned_unique_ptr<NFA> generate(NGHolder &g,
const CompileContext &cc); const CompileContext &cc);
/** /**
* \brief For a given graph, count the number of accel states it will have in * \brief For a given graph, count the number of accelerable states it has.
* an implementation.
* *
* \return the number of accel states, or NFA_MAX_ACCEL_STATES + 1 if an * Note that this number may be greater than the number that are actually
* implementation would not be constructible. * implementable.
*/ */
u32 countAccelStates(NGHolder &h, u32 countAccelStates(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states, const ue2::unordered_map<NFAVertex, u32> &states,


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,7 +42,6 @@
#include "limex_internal.h" #include "limex_internal.h"
#include "nfa_api_util.h" #include "nfa_api_util.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "scratch.h"
#include "util/uniform_ops.h" #include "util/uniform_ops.h"
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////


@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -176,8 +176,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex);
const ReportID *exReports = getExReports(limex); const ReportID *exReports = getExReports(limex);
const u32 *exceptionMap = limex->exceptionMap; const u32 *exceptionMap = limex->exceptionMap;
assert(ISALIGNED_CL(ctx));
assert(ISALIGNED_CL(&ctx->s));
STATE_T s = LOAD_STATE(&ctx->s); STATE_T s = LOAD_STATE(&ctx->s);
/* assert(ISALIGNED_16(exceptions)); */ /* assert(ISALIGNED_16(exceptions)); */
@ -533,17 +531,17 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
assert(ISALIGNED_CL(ctx)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_state = q->streamState + limex->stateSize;
ctx->repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb;
ctx->callback = q->cb; ctx.context = q->context;
ctx->context = q->context; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); ctx.cached_br = 0;
assert(q->items[q->cur].location >= 0); assert(q->items[q->cur].location >= 0);
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -565,7 +563,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
/* do main buffer region */ /* do main buffer region */
DEBUG_PRINTF("MAIN BUFFER SCAN\n"); DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp) if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp)
== MO_HALT_MATCHING) { == MO_HALT_MATCHING) {
STORE_STATE(q->state, ZERO_STATE); STORE_STATE(q->state, ZERO_STATE);
return 0; return 0;
@ -584,19 +582,19 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp - offset; q->items[q->cur].location = sp - offset;
DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end);
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_ALIVE; return MO_ALIVE;
} }
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END\n"); DEBUG_PRINTF("END\n");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (q->cur != q->end) { if (q->cur != q->end) {
q->cur--; q->cur--;
@ -605,7 +603,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
return MO_ALIVE; return MO_ALIVE;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
/* used by suffix execution in Rose */ /* used by suffix execution in Rose */
@ -628,16 +626,16 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
assert(ISALIGNED_CL(ctx)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_state = q->streamState + limex->stateSize;
ctx->repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb;
ctx->callback = q->cb; ctx.context = q->context;
ctx->context = q->context; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); ctx.cached_br = 0;
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -661,7 +659,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
/* do main buffer region */ /* do main buffer region */
u64a final_look = 0; u64a final_look = 0;
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp, if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp,
&final_look) == MO_HALT_MATCHING) { &final_look) == MO_HALT_MATCHING) {
DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n", DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n",
final_look, sp, end_abs, offset); final_look, sp, end_abs, offset);
@ -669,7 +667,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
q->cur--; q->cur--;
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp + final_look - offset; q->items[q->cur].location = sp + final_look - offset;
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
@ -685,19 +683,19 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
q->items[q->cur].type = MQE_START; q->items[q->cur].type = MQE_START;
q->items[q->cur].location = sp - offset; q->items[q->cur].location = sp - offset;
DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end);
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
return MO_ALIVE; return MO_ALIVE;
} }
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END\n"); DEBUG_PRINTF("END\n");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (q->cur != q->end) { if (q->cur != q->end) {
q->cur--; q->cur--;
@ -706,7 +704,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
return MO_ALIVE; return MO_ALIVE;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
// Used for execution Rose prefix/infixes. // Used for execution Rose prefix/infixes.
@ -720,15 +718,16 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
assert(q->cur + 1 < q->end); /* require at least two items */ assert(q->cur + 1 < q->end); /* require at least two items */
struct CONTEXT_T *ctx = q->scratch->nfaContext; struct CONTEXT_T ctx;
ctx->repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T));
ctx->repeat_state = q->streamState + limex->stateSize; ctx.repeat_state = q->streamState + limex->stateSize;
ctx->callback = NULL; ctx.callback = NULL;
ctx->context = NULL; ctx.context = NULL;
STORE_STATE(&ctx->cached_estate, ZERO_STATE); STORE_STATE(&ctx.cached_estate, ZERO_STATE);
ctx.cached_br = 0;
DEBUG_PRINTF("LOAD STATE\n"); DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state)); STORE_STATE(&ctx.s, LOAD_STATE(q->state));
assert(q->items[q->cur].type == MQE_START); assert(q->items[q->cur].type == MQE_START);
u64a offset = q->offset; u64a offset = q->offset;
@ -740,7 +739,7 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
if (n->maxWidth) { if (n->maxWidth) {
if (ep - sp > n->maxWidth) { if (ep - sp > n->maxWidth) {
sp = ep - n->maxWidth; sp = ep - n->maxWidth;
STORE_STATE(&ctx->s, INITIAL_FN(limex, !!sp)); STORE_STATE(&ctx.s, INITIAL_FN(limex, !!sp));
} }
} }
assert(ep >= sp); assert(ep >= sp);
@ -751,7 +750,7 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
u64a local_ep = MIN(offset, ep); u64a local_ep = MIN(offset, ep);
/* we are starting inside the history buffer */ /* we are starting inside the history buffer */
STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset, STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset,
local_ep - sp, ctx, sp); local_ep - sp, &ctx, sp);
sp = local_ep; sp = local_ep;
} }
@ -763,30 +762,30 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
/* do main buffer region */ /* do main buffer region */
DEBUG_PRINTF("MAIN BUFFER SCAN\n"); DEBUG_PRINTF("MAIN BUFFER SCAN\n");
assert(ep - offset <= q->length); assert(ep - offset <= q->length);
STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, ctx, sp); STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp);
DEBUG_PRINTF("SCAN DONE\n"); DEBUG_PRINTF("SCAN DONE\n");
scan_done: scan_done:
sp = ep; sp = ep;
JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, ctx, sp); JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp);
q->cur++; q->cur++;
} }
EXPIRE_ESTATE_FN(limex, ctx, sp); EXPIRE_ESTATE_FN(limex, &ctx, sp);
DEBUG_PRINTF("END, nfa is %s\n", DEBUG_PRINTF("END, nfa is %s\n",
ISNONZERO_STATE(ctx->s) ? "still alive" : "dead"); ISNONZERO_STATE(ctx.s) ? "still alive" : "dead");
STORE_STATE(q->state, LOAD_STATE(&ctx->s)); STORE_STATE(q->state, LOAD_STATE(&ctx.s));
if (JOIN(limexInAccept, SIZE)(limex, LOAD_STATE(&ctx->s), ctx->repeat_ctrl, if (JOIN(limexInAccept, SIZE)(limex, LOAD_STATE(&ctx.s), ctx.repeat_ctrl,
ctx->repeat_state, sp + 1, report)) { ctx.repeat_state, sp + 1, report)) {
return MO_MATCHES_PENDING; return MO_MATCHES_PENDING;
} }
return ISNONZERO_STATE(LOAD_STATE(&ctx->s)); return ISNONZERO_STATE(LOAD_STATE(&ctx.s));
} }
char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
@ -813,42 +812,40 @@ char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
// Block mode reverse scan. // Block mode reverse scan.
char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset,
const u8 *buf, size_t buflen, const u8 *buf, size_t buflen,
const u8 *hbuf, size_t hlen, const u8 *hbuf, size_t hlen,
struct hs_scratch *scratch, NfaCallback cb, void *context) {
NfaCallback cb, void *context) {
assert(buf || hbuf); assert(buf || hbuf);
assert(buflen || hlen); assert(buflen || hlen);
/* This may be called INSIDE another NFA, so we need a separate struct CONTEXT_T ctx;
* context --> Hence the nfaContextSom */ ctx.repeat_ctrl = NULL;
struct CONTEXT_T *ctx = scratch->nfaContextSom; ctx.repeat_state = NULL;
ctx->repeat_ctrl = NULL; ctx.callback = cb;
ctx->repeat_state = NULL; ctx.context = context;
ctx->callback = cb; STORE_STATE(&ctx.cached_estate, ZERO_STATE);
ctx->context = context; ctx.cached_br = 0;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
const IMPL_NFA_T *limex = getImplNfa(n); const IMPL_NFA_T *limex = getImplNfa(n);
STORE_STATE(&ctx->s, INITIAL_FN(limex, 0)); // always anchored STORE_STATE(&ctx.s, INITIAL_FN(limex, 0)); // always anchored
// 'buf' may be null, for example when we're scanning at EOD time. // 'buf' may be null, for example when we're scanning at EOD time.
if (buflen) { if (buflen) {
assert(buf); assert(buf);
DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen); DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen);
offset -= buflen; offset -= buflen;
REV_STREAM_FN(limex, buf, buflen, ctx, offset); REV_STREAM_FN(limex, buf, buflen, &ctx, offset);
} }
if (hlen) { if (hlen) {
assert(hbuf); assert(hbuf);
DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen); DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen);
offset -= hlen; offset -= hlen;
REV_STREAM_FN(limex, hbuf, hlen, ctx, offset); REV_STREAM_FN(limex, hbuf, hlen, &ctx, offset);
} }
if (offset == 0 && ISNONZERO_STATE(LOAD_STATE(&ctx->s))) { if (offset == 0 && ISNONZERO_STATE(LOAD_STATE(&ctx.s))) {
TESTEOD_REV_FN(limex, &ctx->s, offset, cb, context); TESTEOD_REV_FN(limex, &ctx.s, offset, cb, context);
} }
// NOTE: return value is unused. // NOTE: return value is unused.
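The queue entry points above now build their LimEx context on the stack instead of borrowing scratch->nfaContext, so a nested NFA invocation (the case the old nfaContextSom comment worked around) gets a private context, and the cache-line alignment asserts are no longer needed. A minimal sketch of the pattern with simplified stand-in types, not the real CONTEXT_T/STATE_T machinery:

```cpp
#include <cstdint>

// Simplified stand-in for the repeat-control/callback plumbing in CONTEXT_T.
struct ToyContext {
    char *repeat_ctrl;
    char *repeat_state;
    int (*callback)(uint64_t offset, uint32_t id, void *user);
    void *context;
    uint64_t cached_estate; // stands in for the cached exception state
    uint32_t cached_br;
};

static void runQueueSketch(char *streamState, uint32_t stateSize,
                           int (*cb)(uint64_t, uint32_t, void *), void *user) {
    ToyContext ctx;                              // lives only for this call
    ctx.repeat_ctrl = streamState;               // control block at the front
    ctx.repeat_state = streamState + stateSize;  // repeat state follows it
    ctx.callback = cb;
    ctx.context = user;
    ctx.cached_estate = 0;
    ctx.cached_br = 0;
    // ... scan with &ctx, then store the live state back to the queue ...
    (void)ctx;
}
```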

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -445,14 +445,15 @@ char mcclellanExec8_i_ni(const struct mcclellan *m, u8 *state, const u8 *buf,
} }
static really_inline static really_inline
void mcclellanCheckEOD(const struct NFA *nfa, u16 s, u64a offset, char mcclellanCheckEOD(const struct NFA *nfa, u16 s, u64a offset,
NfaCallback cb, void *ctxt) { NfaCallback cb, void *ctxt) {
const struct mcclellan *m = getImplNfa(nfa); const struct mcclellan *m = getImplNfa(nfa);
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (aux->accept_eod) { if (!aux->accept_eod) {
doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); return MO_CONTINUE_MATCHING;
} }
return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL);
} }
static really_inline static really_inline
@ -1019,42 +1020,44 @@ void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state,
const u8 *buf, char top, size_t start_off, const u8 *buf, char top, size_t start_off,
size_t len, NfaCallback cb, void *ctxt) { size_t len, NfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
if (top) {
*(u8 *)state = m->start_anchored; u8 s = top ? m->start_anchored : *(u8 *)state;
}
if (m->flags & MCCLELLAN_FLAG_SINGLE) { if (m->flags & MCCLELLAN_FLAG_SINGLE) {
mcclellanExec8_i(m, (u8 *)state, buf + start_off, len - start_off, mcclellanExec8_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT);
} else { } else {
mcclellanExec8_i(m, (u8 *)state, buf + start_off, len - start_off, mcclellanExec8_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT);
} }
*(u8 *)state = s;
} }
void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state,
const u8 *buf, char top, size_t start_off, const u8 *buf, char top, size_t start_off,
size_t len, NfaCallback cb, void *ctxt) { size_t len, NfaCallback cb, void *ctxt) {
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa);
if (top) {
*(u16 *)state = m->start_anchored; u16 s = top ? m->start_anchored : unaligned_load_u16(state);
}
if (m->flags & MCCLELLAN_FLAG_SINGLE) { if (m->flags & MCCLELLAN_FLAG_SINGLE) {
mcclellanExec16_i(m, (u16 *)state, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT);
} else { } else {
mcclellanExec16_i(m, (u16 *)state, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, buf + start_off, len - start_off,
start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT);
} }
unaligned_store_u16(state, s);
} }
char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state,
UNUSED const char *streamState, UNUSED const char *streamState,
u64a offset, NfaCallback callback, u64a offset, NfaCallback callback,
UNUSED SomNfaCallback som_cb, void *context) { UNUSED SomNfaCallback som_cb, void *context) {
mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, context); return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback,
return 0; context);
} }
char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state,
@ -1062,8 +1065,8 @@ char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state,
u64a offset, NfaCallback callback, u64a offset, NfaCallback callback,
UNUSED SomNfaCallback som_cb, void *context) { UNUSED SomNfaCallback som_cb, void *context) {
assert(ISALIGNED_N(state, 2)); assert(ISALIGNED_N(state, 2));
mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, context); return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback,
return 0; context);
} }
char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
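Two things change in the stream helpers above: mcclellanCheckEOD now returns the callback's continue/halt status instead of swallowing it, and nfaExecMcClellan16_SimpStream stops writing the 16-bit state through a cast pointer, round-tripping it with unaligned load/store helpers since the stream-state buffer carries no alignment guarantee. A self-contained sketch of that second pattern (memcpy being the portable spelling of the unaligned helpers used here):

```cpp
#include <cstdint>
#include <cstring>

static inline uint16_t load_u16_unaligned(const void *p) {
    uint16_t v;
    std::memcpy(&v, p, sizeof(v)); // safe regardless of the buffer's alignment
    return v;
}

static inline void store_u16_unaligned(void *p, uint16_t v) {
    std::memcpy(p, &v, sizeof(v));
}

static void simpStreamSketch(char *state, bool top, uint16_t start_anchored) {
    // On a top event the DFA restarts from its anchored start state;
    // otherwise the previous state is read back out of the stream buffer.
    uint16_t s = top ? start_anchored : load_u16_unaligned(state);
    // ... run the DFA over the buffer here, updating s ...
    store_u16_unaligned(state, s);
}
```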

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -29,8 +29,11 @@
#include "mcclellancompile.h" #include "mcclellancompile.h"
#include "accel.h" #include "accel.h"
#include "accelcompile.h"
#include "grey.h" #include "grey.h"
#include "mcclellan_internal.h" #include "mcclellan_internal.h"
#include "mcclellancompile_accel.h"
#include "mcclellancompile_util.h"
#include "nfa_internal.h" #include "nfa_internal.h"
#include "shufticompile.h" #include "shufticompile.h"
#include "trufflecompile.h" #include "trufflecompile.h"
@ -43,6 +46,7 @@
#include "util/container.h" #include "util/container.h"
#include "util/make_unique.h" #include "util/make_unique.h"
#include "util/order_check.h" #include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
#include "util/unaligned.h" #include "util/unaligned.h"
#include "util/verify_types.h" #include "util/verify_types.h"
@ -56,25 +60,18 @@
#include <set> #include <set>
#include <vector> #include <vector>
#include <boost/range/adaptor/map.hpp>
using namespace std; using namespace std;
using boost::adaptors::map_keys;
namespace ue2 { namespace ue2 {
/* compile time accel defs */
#define ACCEL_MAX_STOP_CHAR 160 /* larger than nfa, as we don't have a budget
and the nfa cheats on stop characters for
sets of states */
#define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */
namespace /* anon */ { namespace /* anon */ {
struct dstate_extra { struct dstate_extra {
u16 daddytaken; u16 daddytaken = 0;
bool shermanState; bool shermanState = false;
bool accelerable;
dstate_extra(void) : daddytaken(0), shermanState(false),
accelerable(false) {}
}; };
struct dfa_info { struct dfa_info {
@ -105,10 +102,6 @@ struct dfa_info {
return extra[raw_id].shermanState; return extra[raw_id].shermanState;
} }
bool is_accel(dstate_id_t raw_id) const {
return extra[raw_id].accelerable;
}
size_t size(void) const { return states.size(); } size_t size(void) const { return states.size(); }
}; };
@ -135,6 +128,13 @@ mstate_aux *getAux(NFA *n, dstate_id_t i) {
return aux; return aux;
} }
static
bool double_byte_ok(const AccelScheme &info) {
return !info.double_byte.empty()
&& info.double_cr.count() < info.double_byte.size()
&& info.double_cr.count() <= 2 && !info.double_byte.empty();
}
static static
void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) { void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
assert((size_t)succ_table % 2 == 0); assert((size_t)succ_table % 2 == 0);
@ -186,75 +186,45 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
} }
} }
void mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx, u32 mcclellan_build_strat::max_allowed_offset_accel() const {
escape_info *out) const { return ACCEL_DFA_MAX_OFFSET_DEPTH;
const dstate &raw = rdfa.states[this_idx]; }
const auto &alpha_remap = rdfa.alpha_remap;
flat_set<pair<u8, u8>> outs2_local; AccelScheme mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx)
for (unsigned i = 0; i < N_CHARS; i++) { const {
outs2_local.clear(); return find_mcclellan_escape_info(rdfa, this_idx,
max_allowed_offset_accel());
if (raw.next[alpha_remap[i]] != this_idx) {
out->outs.set(i);
DEBUG_PRINTF("next is %hu\n", raw.next[alpha_remap[i]]);
const dstate &raw_next = rdfa.states[raw.next[alpha_remap[i]]];
if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) {
DEBUG_PRINTF("leads to report\n");
out->outs2_broken = true; /* cannot accelerate over reports */
}
for (unsigned j = 0; !out->outs2_broken && j < N_CHARS; j++) {
if (raw_next.next[alpha_remap[j]] == raw.next[alpha_remap[j]]) {
continue;
}
DEBUG_PRINTF("adding %02x %02x -> %hu to 2 \n", i, j,
raw_next.next[alpha_remap[j]]);
outs2_local.emplace((u8)i, (u8)j);
}
if (outs2_local.size() > 8) {
DEBUG_PRINTF("adding %02x to outs2_single\n", i);
out->outs2_single.set(i);
} else {
insert(&out->outs2, outs2_local);
}
if (out->outs2.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
out->outs2_broken = true;
}
}
}
} }
/** builds acceleration schemes for states */ /** builds acceleration schemes for states */
void mcclellan_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) { void mcclellan_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
const AccelScheme &info,
void *accel_out) {
AccelAux *accel = (AccelAux *)accel_out; AccelAux *accel = (AccelAux *)accel_out;
escape_info out;
find_escape_strings(this_idx, &out); DEBUG_PRINTF("acceleration scheme has offset s%u/d%u\n", info.offset,
info.double_offset);
accel->generic.offset = verify_u8(info.offset);
if (!out.outs2_broken && out.outs2_single.none() if (double_byte_ok(info) && info.double_cr.none()
&& out.outs2.size() == 1) { && info.double_byte.size() == 1) {
accel->accel_type = ACCEL_DVERM; accel->accel_type = ACCEL_DVERM;
accel->dverm.c1 = out.outs2.begin()->first; accel->dverm.c1 = info.double_byte.begin()->first;
accel->dverm.c2 = out.outs2.begin()->second; accel->dverm.c2 = info.double_byte.begin()->second;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx);
return; return;
} }
if (!out.outs2_broken && out.outs2_single.none() if (double_byte_ok(info) && info.double_cr.none()
&& (out.outs2.size() == 2 || out.outs2.size() == 4)) { && (info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
bool ok = true; bool ok = true;
assert(!out.outs2.empty()); assert(!info.double_byte.empty());
u8 firstC = out.outs2.begin()->first & CASE_CLEAR; u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
u8 secondC = out.outs2.begin()->second & CASE_CLEAR; u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
for (const pair<u8, u8> &p : out.outs2) { for (const pair<u8, u8> &p : info.double_byte) {
if ((p.first & CASE_CLEAR) != firstC if ((p.first & CASE_CLEAR) != firstC
|| (p.second & CASE_CLEAR) != secondC) { || (p.second & CASE_CLEAR) != secondC) {
ok = false; ok = false;
@ -266,185 +236,76 @@ void mcclellan_build_strat::buildAccel(dstate_id_t this_idx, void *accel_out) {
accel->accel_type = ACCEL_DVERM_NOCASE; accel->accel_type = ACCEL_DVERM_NOCASE;
accel->dverm.c1 = firstC; accel->dverm.c1 = firstC;
accel->dverm.c2 = secondC; accel->dverm.c2 = secondC;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
return; return;
} }
u8 m1;
u8 m2;
if (buildDvermMask(info.double_byte, &m1, &m2)) {
accel->accel_type = ACCEL_DVERM_MASKED;
accel->dverm.offset = verify_u8(info.double_offset);
accel->dverm.c1 = info.double_byte.begin()->first & m1;
accel->dverm.c2 = info.double_byte.begin()->second & m2;
accel->dverm.m1 = m1;
accel->dverm.m2 = m2;
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
accel->dverm.c1, accel->dverm.c2);
return;
}
} }
if (!out.outs2_broken && if (double_byte_ok(info)
(out.outs2_single.count() + out.outs2.size()) <= 8 && && shuftiBuildDoubleMasks(info.double_cr, info.double_byte,
out.outs2_single.count() < out.outs2.size() && &accel->dshufti.lo1, &accel->dshufti.hi1,
out.outs2_single.count() <= 2 && !out.outs2.empty()) { &accel->dshufti.lo2, &accel->dshufti.hi2)) {
accel->accel_type = ACCEL_DSHUFTI; accel->accel_type = ACCEL_DSHUFTI;
shuftiBuildDoubleMasks(out.outs2_single, out.outs2, accel->dshufti.offset = verify_u8(info.double_offset);
&accel->dshufti.lo1,
&accel->dshufti.hi1,
&accel->dshufti.lo2,
&accel->dshufti.hi2);
DEBUG_PRINTF("state %hu is double shufti\n", this_idx); DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
return; return;
} }
if (out.outs.none()) { if (info.cr.none()) {
accel->accel_type = ACCEL_RED_TAPE; accel->accel_type = ACCEL_RED_TAPE;
DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape"
" from which there is no escape\n", this_idx); " from which there is no escape\n", this_idx);
return; return;
} }
if (out.outs.count() == 1) { if (info.cr.count() == 1) {
accel->accel_type = ACCEL_VERM; accel->accel_type = ACCEL_VERM;
accel->verm.c = out.outs.find_first(); accel->verm.c = info.cr.find_first();
DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is vermicelli\n", this_idx);
return; return;
} }
if (out.outs.count() == 2 && out.outs.isCaselessChar()) { if (info.cr.count() == 2 && info.cr.isCaselessChar()) {
accel->accel_type = ACCEL_VERM_NOCASE; accel->accel_type = ACCEL_VERM_NOCASE;
accel->verm.c = out.outs.find_first() & CASE_CLEAR; accel->verm.c = info.cr.find_first() & CASE_CLEAR;
DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx);
return; return;
} }
if (out.outs.count() > ACCEL_MAX_FLOATING_STOP_CHAR) { if (info.cr.count() > ACCEL_DFA_MAX_FLOATING_STOP_CHAR) {
accel->accel_type = ACCEL_NONE; accel->accel_type = ACCEL_NONE;
DEBUG_PRINTF("state %hu is too broad\n", this_idx); DEBUG_PRINTF("state %hu is too broad\n", this_idx);
return; return;
} }
accel->accel_type = ACCEL_SHUFTI; accel->accel_type = ACCEL_SHUFTI;
if (-1 != shuftiBuildMasks(out.outs, &accel->shufti.lo, if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo,
&accel->shufti.hi)) { &accel->shufti.hi)) {
DEBUG_PRINTF("state %hu is shufti\n", this_idx); DEBUG_PRINTF("state %hu is shufti\n", this_idx);
return; return;
} }
assert(!out.outs.none()); assert(!info.cr.none());
accel->accel_type = ACCEL_TRUFFLE; accel->accel_type = ACCEL_TRUFFLE;
truffleBuildMasks(out.outs, &accel->truffle.mask1, &accel->truffle.mask2); truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2);
DEBUG_PRINTF("state %hu is truffle\n", this_idx); DEBUG_PRINTF("state %hu is truffle\n", this_idx);
} }
static
bool is_accel(const raw_dfa &raw, dstate_id_t sds_or_proxy,
dstate_id_t this_idx) {
if (!this_idx /* dead state is not accelerable */) {
return false;
}
/* Note on report acceleration states: While we can't accelerate while we
* are spamming out callbacks, the QR code paths don't raise reports
* during scanning so they can accelerate report states. */
if (generates_callbacks(raw.kind)
&& !raw.states[this_idx].reports.empty()) {
return false;
}
size_t single_limit = this_idx == sds_or_proxy ?
ACCEL_MAX_FLOATING_STOP_CHAR : ACCEL_MAX_STOP_CHAR;
DEBUG_PRINTF("inspecting %hu/%hu: %zu\n", this_idx, sds_or_proxy,
single_limit);
CharReach out;
for (u32 i = 0; i < N_CHARS; i++) {
if (raw.states[this_idx].next[raw.alpha_remap[i]] != this_idx) {
out.set(i);
}
}
if (out.count() <= single_limit) {
DEBUG_PRINTF("state %hu should be accelerable %zu\n", this_idx,
out.count());
return true;
}
DEBUG_PRINTF("state %hu is not accelerable has %zu\n", this_idx,
out.count());
return false;
}
static
bool has_self_loop(dstate_id_t s, const raw_dfa &raw) {
u16 top_remap = raw.alpha_remap[TOP];
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
if (i != top_remap && raw.states[s].next[i] == s) {
return true;
}
}
return false;
}
static
dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
if (raw.start_floating != DEAD_STATE) {
DEBUG_PRINTF("has floating start\n");
return raw.start_floating;
}
DEBUG_PRINTF("looking for SDS proxy\n");
dstate_id_t s = raw.start_anchored;
if (has_self_loop(s, raw)) {
return s;
}
u16 top_remap = raw.alpha_remap[TOP];
ue2::unordered_set<dstate_id_t> seen;
while (true) {
seen.insert(s);
DEBUG_PRINTF("basis %hu\n", s);
/* check if we are connected to a state with a self loop */
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t t = raw.states[s].next[i];
if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) {
return t;
}
}
/* find a neighbour to use as a basis for looking for the sds proxy */
dstate_id_t t = DEAD_STATE;
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t tt = raw.states[s].next[i];
if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) {
t = tt;
break;
}
}
if (t == DEAD_STATE) {
/* we were unable to find a state to use as a SDS proxy */
return DEAD_STATE;
}
s = t;
seen.insert(t);
}
}
static
void populateAccelerationInfo(dfa_info &info, u32 *ac, const Grey &grey) {
*ac = 0; /* number of accelerable states */
if (!grey.accelerateDFA) {
return;
}
dstate_id_t sds_proxy = get_sds_or_proxy(info.raw);
DEBUG_PRINTF("sds %hu\n", sds_proxy);
for (size_t i = 0; i < info.size(); i++) {
if (is_accel(info.raw, sds_proxy, i)) {
++*ac;
info.extra[i].accelerable = true;
}
}
}
static static
void populateBasicInfo(size_t state_size, const dfa_info &info, void populateBasicInfo(size_t state_size, const dfa_info &info,
u32 total_size, u32 aux_offset, u32 accel_offset, u32 total_size, u32 aux_offset, u32 accel_offset,
@ -496,8 +357,16 @@ namespace {
struct raw_report_list { struct raw_report_list {
flat_set<ReportID> reports; flat_set<ReportID> reports;
explicit raw_report_list(const flat_set<ReportID> &reports_in) raw_report_list(const flat_set<ReportID> &reports_in,
: reports(reports_in) {} const ReportManager &rm, bool do_remap) {
if (do_remap) {
for (auto &id : reports_in) {
reports.insert(rm.getProgramOffset(id));
}
} else {
reports = reports_in;
}
}
bool operator<(const raw_report_list &b) const { bool operator<(const raw_report_list &b) const {
return reports < b.reports; return reports < b.reports;
@ -520,6 +389,8 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
ReportID *arbReport) const { ReportID *arbReport) const {
DEBUG_PRINTF("gathering reports\n"); DEBUG_PRINTF("gathering reports\n");
const bool remap_reports = has_managed_reports(rdfa.kind);
auto ri = ue2::make_unique<raw_report_info_impl>(); auto ri = ue2::make_unique<raw_report_info_impl>();
map<raw_report_list, u32> rev; map<raw_report_list, u32> rev;
@ -529,7 +400,7 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
continue; continue;
} }
raw_report_list rrl(s.reports); raw_report_list rrl(s.reports, rm, remap_reports);
DEBUG_PRINTF("non empty r\n"); DEBUG_PRINTF("non empty r\n");
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports.push_back(rev[rrl]); reports.push_back(rev[rrl]);
@ -548,7 +419,7 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
} }
DEBUG_PRINTF("non empty r eod\n"); DEBUG_PRINTF("non empty r eod\n");
raw_report_list rrl(s.reports_eod); raw_report_list rrl(s.reports_eod, rm, remap_reports);
if (rev.find(rrl) != rev.end()) { if (rev.find(rrl) != rev.end()) {
reports_eod.push_back(rev[rrl]); reports_eod.push_back(rev[rrl]);
continue; continue;
@ -625,6 +496,14 @@ void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
} }
} }
static
void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
set<dstate_id_t> *accel_states) {
for (dstate_id_t i : accel_escape_info | map_keys) {
accel_states->insert(i);
}
}
static static
size_t calcShermanRegionSize(const dfa_info &info) { size_t calcShermanRegionSize(const dfa_info &info) {
size_t rv = 0; size_t rv = 0;
@ -692,14 +571,14 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
static static
aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info, aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
const CompileContext &cc) { const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 16\n"); DEBUG_PRINTF("building mcclellan 16\n");
vector<u32> reports; /* index in ri for the appropriate report list */ vector<u32> reports; /* index in ri for the appropriate report list */
vector<u32> reports_eod; /* as above */ vector<u32> reports_eod; /* as above */
ReportID arb; ReportID arb;
u8 single; u8 single;
u32 accelCount;
u8 alphaShift = info.getAlphaShift(); u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8); assert(alphaShift <= 8);
@ -711,9 +590,9 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
return nullptr; return nullptr;
} }
unique_ptr<raw_report_info> ri auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
= info.strat.gatherReports(reports, reports_eod, &single, &arb); map<dstate_id_t, AccelScheme> accel_escape_info
populateAccelerationInfo(info, &accelCount, cc.grey); = populateAccelerationInfo(info.raw, info.strat, cc.grey);
size_t tran_size = (1 << info.getAlphaShift()) size_t tran_size = (1 << info.getAlphaShift())
* sizeof(u16) * count_real_states; * sizeof(u16) * count_real_states;
@ -721,7 +600,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
size_t aux_size = sizeof(mstate_aux) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size); size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size);
size_t accel_size = info.strat.accelSize() * accelCount; size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32); + ri->getReportListSize(), 32);
size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
@ -736,7 +615,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
char *nfa_base = (char *)nfa.get(); char *nfa_base = (char *)nfa.get();
populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset, populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset,
accelCount, arb, single, nfa.get()); accel_escape_info.size(), arb, single, nfa.get());
vector<u32> reportOffsets; vector<u32> reportOffsets;
@ -769,12 +648,12 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
fillInAux(&aux[fs], i, info, reports, reports_eod, reportOffsets); fillInAux(&aux[fs], i, info, reports, reports_eod, reportOffsets);
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
this_aux->accel_offset = accel_offset; this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
assert(accel_offset + sizeof(NFA) <= sherman_offset); assert(accel_offset + sizeof(NFA) <= sherman_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset)); (void *)((char *)m + this_aux->accel_offset));
} }
} }
@ -798,12 +677,12 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets); fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
this_aux->accel_offset = accel_offset; this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
assert(accel_offset + sizeof(NFA) <= sherman_offset); assert(accel_offset + sizeof(NFA) <= sherman_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset)); (void *)((char *)m + this_aux->accel_offset));
} }
@ -836,6 +715,10 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
markEdges(nfa.get(), succ_table, info); markEdges(nfa.get(), succ_table, info);
if (accel_states && nfa) {
fillAccelOut(accel_escape_info, accel_states);
}
return nfa; return nfa;
} }
@ -874,7 +757,9 @@ void fillInBasicState8(const dfa_info &info, mstate_aux *aux, u8 *succ_table,
} }
static static
void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) { void allocateFSN8(dfa_info &info,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
u16 *accel_limit, u16 *accept_limit) {
info.states[0].impl_id = 0; /* dead is always 0 */ info.states[0].impl_id = 0; /* dead is always 0 */
vector<dstate_id_t> norm; vector<dstate_id_t> norm;
@ -886,7 +771,7 @@ void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) {
for (u32 i = 1; i < info.size(); i++) { for (u32 i = 1; i < info.size(); i++) {
if (!info.states[i].reports.empty()) { if (!info.states[i].reports.empty()) {
accept.push_back(i); accept.push_back(i);
} else if (info.is_accel(i)) { } else if (contains(accel_escape_info, i)) {
accel.push_back(i); accel.push_back(i);
} else { } else {
norm.push_back(i); norm.push_back(i);
@ -915,23 +800,23 @@ void allocateFSN8(dfa_info &info, u16 *accel_limit, u16 *accept_limit) {
static static
aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info, aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
const CompileContext &cc) { const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 8\n"); DEBUG_PRINTF("building mcclellan 8\n");
vector<u32> reports; vector<u32> reports;
vector<u32> reports_eod; vector<u32> reports_eod;
ReportID arb; ReportID arb;
u8 single; u8 single;
u32 accelCount;
unique_ptr<raw_report_info> ri auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
= info.strat.gatherReports(reports, reports_eod, &single, &arb); map<dstate_id_t, AccelScheme> accel_escape_info
populateAccelerationInfo(info, &accelCount, cc.grey); = populateAccelerationInfo(info.raw, info.strat, cc.grey);
size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size(); size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size();
size_t aux_size = sizeof(mstate_aux) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size); size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size);
size_t accel_size = info.strat.accelSize() * accelCount; size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32); + ri->getReportListSize(), 32);
size_t total_size = accel_offset + accel_size; size_t total_size = accel_offset + accel_size;
@ -951,9 +836,9 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get()); mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get());
allocateFSN8(info, &m->accel_limit_8, &m->accept_limit_8); allocateFSN8(info, accel_escape_info, &m->accel_limit_8, &m->accept_limit_8);
populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset, populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset,
accelCount, arb, single, nfa.get()); accel_escape_info.size(), arb, single, nfa.get());
vector<u32> reportOffsets; vector<u32> reportOffsets;
@ -964,13 +849,14 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset); mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset);
for (size_t i = 0; i < info.size(); i++) { for (size_t i = 0; i < info.size(); i++) {
if (info.is_accel(i)) { if (contains(accel_escape_info, i)) {
u32 j = info.implId(i); u32 j = info.implId(i);
aux[j].accel_offset = accel_offset; aux[j].accel_offset = accel_offset;
accel_offset += info.strat.accelSize(); accel_offset += info.strat.accelSize();
info.strat.buildAccel(i, (void *)((char *)m + aux[j].accel_offset)); info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + aux[j].accel_offset));
} }
fillInBasicState8(info, aux, succ_table, reportOffsets, reports, fillInBasicState8(info, aux, succ_table, reportOffsets, reports,
@ -981,6 +867,10 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
DEBUG_PRINTF("rl size %zu\n", ri->size()); DEBUG_PRINTF("rl size %zu\n", ri->size());
if (accel_states && nfa) {
fillAccelOut(accel_escape_info, accel_states);
}
return nfa; return nfa;
} }
@ -1163,15 +1053,6 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
return false; return false;
} }
static
void fillAccelOut(const dfa_info &info, set<dstate_id_t> *accel_states) {
for (size_t i = 0; i < info.size(); i++) {
if (info.is_accel(i)) {
accel_states->insert(i);
}
}
}
aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat,
const CompileContext &cc, const CompileContext &cc,
set<dstate_id_t> *accel_states) { set<dstate_id_t> *accel_states) {
@ -1200,26 +1081,23 @@ aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat,
aligned_unique_ptr<NFA> nfa; aligned_unique_ptr<NFA> nfa;
if (!using8bit) { if (!using8bit) {
nfa = mcclellanCompile16(info, cc); nfa = mcclellanCompile16(info, cc, accel_states);
} else { } else {
nfa = mcclellanCompile8(info, cc); nfa = mcclellanCompile8(info, cc, accel_states);
} }
if (has_eod_reports) { if (has_eod_reports) {
nfa->flags |= NFA_ACCEPTS_EOD; nfa->flags |= NFA_ACCEPTS_EOD;
} }
if (accel_states && nfa) {
fillAccelOut(info, accel_states);
}
DEBUG_PRINTF("compile done\n"); DEBUG_PRINTF("compile done\n");
return nfa; return nfa;
} }
aligned_unique_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc, aligned_unique_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
set<dstate_id_t> *accel_states) { set<dstate_id_t> *accel_states) {
mcclellan_build_strat mbs(raw); mcclellan_build_strat mbs(raw, rm);
return mcclellanCompile_i(raw, mbs, cc, accel_states); return mcclellanCompile_i(raw, mbs, cc, accel_states);
} }
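The compile path above now performs acceleration analysis once, producing a map from state id to AccelScheme, and both the 8-bit and 16-bit builders simply look states up in that map and hand the scheme to buildAccel. A toy sketch of the flow, with simplified stand-in types rather than the real dfa_info/AccelScheme:

```cpp
#include <cstdint>
#include <map>
#include <set>

struct ToyScheme { unsigned escape_count; };
constexpr unsigned kMaxStopChars = 160; // stand-in for ACCEL_DFA_MAX_STOP_CHAR

// Analysis pass: one scheme per state that stays under the escape budget.
static std::map<uint16_t, ToyScheme>
analyseAccel(const std::map<uint16_t, unsigned> &escapesPerState) {
    std::map<uint16_t, ToyScheme> out;
    for (const auto &e : escapesPerState) {
        if (e.second <= kMaxStopChars) {
            out[e.first] = ToyScheme{e.second};
        }
    }
    return out;
}

// Build pass: only states present in the map get an accel structure; the same
// map is what fillAccelOut() copies into the caller's accel_states set.
static void buildSketch(const std::map<uint16_t, ToyScheme> &schemes,
                        std::set<uint16_t> *accel_states) {
    for (const auto &e : schemes) {
        // buildAccel(e.first, e.second, ...) would emit the AccelAux here.
        if (accel_states) {
            accel_states->insert(e.first);
        }
    }
}
```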

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2016, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -31,6 +31,7 @@
#include "rdfa.h" #include "rdfa.h"
#include "ue2common.h" #include "ue2common.h"
#include "util/accel_scheme.h"
#include "util/alloc.h" #include "util/alloc.h"
#include "util/charreach.h" #include "util/charreach.h"
#include "util/ue2_containers.h" #include "util/ue2_containers.h"
@ -43,6 +44,7 @@ struct NFA;
namespace ue2 { namespace ue2 {
class ReportManager;
struct CompileContext; struct CompileContext;
struct raw_report_info { struct raw_report_info {
@ -54,15 +56,9 @@ struct raw_report_info {
std::vector<u32> &ro /* out */) const = 0; std::vector<u32> &ro /* out */) const = 0;
}; };
struct escape_info {
CharReach outs;
CharReach outs2_single;
flat_set<std::pair<u8, u8>> outs2;
bool outs2_broken = false;
};
class dfa_build_strat { class dfa_build_strat {
public: public:
explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {}
virtual ~dfa_build_strat(); virtual ~dfa_build_strat();
virtual raw_dfa &get_raw() const = 0; virtual raw_dfa &get_raw() const = 0;
virtual std::unique_ptr<raw_report_info> gatherReports( virtual std::unique_ptr<raw_report_info> gatherReports(
@ -70,25 +66,29 @@ public:
std::vector<u32> &reports_eod /* out */, std::vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const = 0; ReportID *arbReport /* out */) const = 0;
virtual void find_escape_strings(dstate_id_t this_idx, virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const = 0;
escape_info *out) const = 0;
virtual size_t accelSize(void) const = 0; virtual size_t accelSize(void) const = 0;
virtual void buildAccel(dstate_id_t this_idx, void *accel_out) = 0; virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) = 0;
protected:
const ReportManager &rm;
}; };
class mcclellan_build_strat : public dfa_build_strat { class mcclellan_build_strat : public dfa_build_strat {
public: public:
explicit mcclellan_build_strat(raw_dfa &r) : rdfa(r) {} mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
: dfa_build_strat(rm_in), rdfa(rdfa_in) {}
raw_dfa &get_raw() const override { return rdfa; } raw_dfa &get_raw() const override { return rdfa; }
std::unique_ptr<raw_report_info> gatherReports( std::unique_ptr<raw_report_info> gatherReports(
std::vector<u32> &reports /* out */, std::vector<u32> &reports /* out */,
std::vector<u32> &reports_eod /* out */, std::vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */, u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const override; ReportID *arbReport /* out */) const override;
void find_escape_strings(dstate_id_t this_idx, AccelScheme find_escape_strings(dstate_id_t this_idx) const override;
escape_info *out) const override;
size_t accelSize(void) const override; size_t accelSize(void) const override;
void buildAccel(dstate_id_t this_idx, void *accel_out) override; void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) override;
virtual u32 max_allowed_offset_accel() const;
private: private:
raw_dfa &rdfa; raw_dfa &rdfa;
@ -98,6 +98,7 @@ private:
* states */ * states */
ue2::aligned_unique_ptr<NFA> ue2::aligned_unique_ptr<NFA>
mcclellanCompile(raw_dfa &raw, const CompileContext &cc, mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
std::set<dstate_id_t> *accel_states = nullptr); std::set<dstate_id_t> *accel_states = nullptr);
/* used internally by mcclellan/haig/gough compile process */ /* used internally by mcclellan/haig/gough compile process */

View File

@ -0,0 +1,422 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "mcclellancompile_accel.h"
#include "mcclellancompile_util.h"
#include "grey.h"
#include "nfagraph/ng_limex_accel.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include <vector>
#include <sstream>
#define PATHS_LIMIT 500
using namespace std;
namespace ue2 {
namespace {
struct path {
vector<CharReach> reach;
dstate_id_t dest = DEAD_STATE;
explicit path(dstate_id_t base) : dest(base) {}
};
}
static UNUSED
string describeClasses(const vector<CharReach> &v) {
std::ostringstream oss;
for (const auto &cr : v) {
describeClass(oss, cr);
}
return oss.str();
}
static
void dump_paths(const vector<path> &paths) {
for (UNUSED const auto &p : paths) {
DEBUG_PRINTF("[%s] -> %u\n", describeClasses(p.reach).c_str(), p.dest);
}
DEBUG_PRINTF("%zu paths\n", paths.size());
}
static
bool is_useful_path(const vector<path> &good, const path &p) {
for (const auto &g : good) {
assert(g.dest == p.dest);
assert(g.reach.size() <= p.reach.size());
auto git = g.reach.rbegin();
auto pit = p.reach.rbegin();
for (; git != g.reach.rend(); ++git, ++pit) {
if (!pit->isSubsetOf(*git)) {
goto next;
}
}
DEBUG_PRINTF("better: [%s] -> %u\n",
describeClasses(g.reach).c_str(), g.dest);
return false;
next:;
}
return true;
}
static
path append(const path &orig, const CharReach &cr, u32 new_dest) {
path p(new_dest);
p.reach = orig.reach;
p.reach.push_back(cr);
return p;
}
static
void extend(const raw_dfa &rdfa, const path &p,
map<u32, vector<path> > &all,
vector<path> &out) {
dstate s = rdfa.states[p.dest];
if (!p.reach.empty() && p.reach.back().none()) {
out.push_back(p);
return;
}
if (!s.reports.empty()) {
if (generates_callbacks(rdfa.kind)) {
out.push_back(p);
return;
} else {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
}
}
if (!s.reports_eod.empty()) {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
}
map<u32, CharReach> dest;
for (unsigned i = 0; i < N_CHARS; i++) {
u32 succ = s.next[rdfa.alpha_remap[i]];
dest[succ].set(i);
}
for (const auto &e : dest) {
path pp = append(p, e.second, e.first);
if (!is_useful_path(all[e.first], pp)) {
DEBUG_PRINTF("not useful: [%s] -> %u\n",
describeClasses(pp.reach).c_str(), pp.dest);
continue;
}
DEBUG_PRINTF("----good: [%s] -> %u\n",
describeClasses(pp.reach).c_str(), pp.dest);
all[e.first].push_back(pp);
out.push_back(pp);
}
}
static
vector<vector<CharReach> > generate_paths(const raw_dfa &rdfa, dstate_id_t base,
u32 len) {
vector<path> paths{ path(base) };
map<u32, vector<path> > all;
all[base].push_back(path(base));
for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) {
vector<path> next_gen;
for (const auto &p : paths) {
extend(rdfa, p, all, next_gen);
}
paths = move(next_gen);
}
dump_paths(paths);
vector<vector<CharReach> > rv;
for (auto &p : paths) {
rv.push_back(move(p.reach));
}
return rv;
}
static
AccelScheme look_for_offset_accel(const raw_dfa &rdfa, dstate_id_t base,
u32 max_allowed_accel_offset) {
DEBUG_PRINTF("looking for accel for %hu\n", base);
vector<vector<CharReach> > paths = generate_paths(rdfa, base,
max_allowed_accel_offset + 1);
AccelScheme as = findBestAccelScheme(paths, CharReach(), true);
DEBUG_PRINTF("found %s + %u\n", describeClass(as.cr).c_str(), as.offset);
return as;
}
static
vector<u16> find_nonexit_symbols(const raw_dfa &rdfa,
const CharReach &escape) {
set<u16> rv;
CharReach nonexit = ~escape;
for (auto i = nonexit.find_first(); i != CharReach::npos;
i = nonexit.find_next(i)) {
rv.insert(rdfa.alpha_remap[i]);
}
return vector<u16>(rv.begin(), rv.end());
}
static
set<dstate_id_t> find_region(const raw_dfa &rdfa, dstate_id_t base,
const AccelScheme &ei) {
DEBUG_PRINTF("looking for region around %hu\n", base);
set<dstate_id_t> region = {base};
if (!ei.double_byte.empty()) {
return region;
}
DEBUG_PRINTF("accel %s+%u\n", describeClass(ei.cr).c_str(), ei.offset);
const CharReach &escape = ei.cr;
auto nonexit_symbols = find_nonexit_symbols(rdfa, escape);
vector<dstate_id_t> pending = {base};
while (!pending.empty()) {
dstate_id_t curr = pending.back();
pending.pop_back();
for (auto s : nonexit_symbols) {
dstate_id_t t = rdfa.states[curr].next[s];
if (contains(region, t)) {
continue;
}
DEBUG_PRINTF(" %hu is in region\n", t);
region.insert(t);
pending.push_back(t);
}
}
return region;
}
static
bool better(const AccelScheme &a, const AccelScheme &b) {
if (!a.double_byte.empty() && b.double_byte.empty()) {
return true;
}
if (!b.double_byte.empty()) {
return false;
}
return a.cr.count() < b.cr.count();
}
static
vector<CharReach> reverse_alpha_remapping(const raw_dfa &rdfa) {
vector<CharReach> rv(rdfa.alpha_size - 1); /* TOP not required */
for (u32 i = 0; i < N_CHARS; i++) {
rv.at(rdfa.alpha_remap[i]).set(i);
}
return rv;
}
map<dstate_id_t, AccelScheme> populateAccelerationInfo(const raw_dfa &rdfa,
const dfa_build_strat &strat,
const Grey &grey) {
map<dstate_id_t, AccelScheme> rv;
if (!grey.accelerateDFA) {
return rv;
}
dstate_id_t sds_proxy = get_sds_or_proxy(rdfa);
DEBUG_PRINTF("sds %hu\n", sds_proxy);
for (size_t i = 0; i < rdfa.states.size(); i++) {
if (i == DEAD_STATE) {
continue;
}
/* Note on report acceleration states: While we can't accelerate while we
* are spamming out callbacks, the QR code paths don't raise reports
* during scanning so they can accelerate report states. */
if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) {
continue;
}
size_t single_limit = i == sds_proxy ? ACCEL_DFA_MAX_FLOATING_STOP_CHAR
: ACCEL_DFA_MAX_STOP_CHAR;
DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit);
AccelScheme ei = strat.find_escape_strings(i);
if (ei.cr.count() > single_limit) {
DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i,
ei.cr.count());
continue;
}
DEBUG_PRINTF("state %zu should be accelerable %zu\n",
i, ei.cr.count());
rv[i] = ei;
}
/* provide acceleration states to states in the region of sds */
if (contains(rv, sds_proxy)) {
AccelScheme sds_ei = rv[sds_proxy];
sds_ei.double_byte.clear(); /* region based on single byte scheme
* may differ from double byte */
DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n",
sds_ei.cr.count());
auto sds_region = find_region(rdfa, sds_proxy, sds_ei);
for (auto s : sds_region) {
if (!contains(rv, s) || better(sds_ei, rv[s])) {
rv[s] = sds_ei;
}
}
}
return rv;
}
static
bool double_byte_ok(const AccelScheme &info) {
return !info.double_byte.empty()
&& info.double_cr.count() < info.double_byte.size()
&& info.double_cr.count() <= 2 && !info.double_byte.empty();
}
AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx,
u32 max_allowed_accel_offset) {
AccelScheme rv;
rv.cr.clear();
rv.offset = 0;
const dstate &raw = rdfa.states[this_idx];
const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
bool outs2_broken = false;
map<dstate_id_t, CharReach> succs;
for (u32 i = 0; i < rev_map.size(); i++) {
if (raw.next[i] == this_idx) {
continue;
}
const CharReach &cr_i = rev_map.at(i);
rv.cr |= cr_i;
dstate_id_t next_id = raw.next[i];
DEBUG_PRINTF("next is %hu\n", next_id);
const dstate &raw_next = rdfa.states[next_id];
if (outs2_broken) {
continue;
}
if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) {
DEBUG_PRINTF("leads to report\n");
outs2_broken = true; /* cannot accelerate over reports */
continue;
}
succs[next_id] |= cr_i;
}
if (!outs2_broken) {
for (const auto &e : succs) {
const CharReach &cr_i = e.second;
const dstate &raw_next = rdfa.states[e.first];
CharReach cr_all_j;
for (u32 j = 0; j < rev_map.size(); j++) {
if (raw_next.next[j] == raw.next[j]) {
continue;
}
DEBUG_PRINTF("state %hu: adding sym %u -> %hu to 2 \n", e.first,
j, raw_next.next[j]);
cr_all_j |= rev_map.at(j);
}
if (cr_i.count() * cr_all_j.count() > 8) {
DEBUG_PRINTF("adding %zu to double_cr\n", cr_i.count());
rv.double_cr |= cr_i;
} else {
for (auto ii = cr_i.find_first(); ii != CharReach::npos;
ii = cr_i.find_next(ii)) {
for (auto jj = cr_all_j.find_first(); jj != CharReach::npos;
jj = cr_all_j.find_next(jj)) {
rv.double_byte.emplace((u8)ii, (u8)jj);
}
}
}
}
if (rv.double_byte.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
outs2_broken = true;
}
if (outs2_broken) {
rv.double_byte.clear();
}
}
DEBUG_PRINTF("this %u, sds proxy %hu\n", this_idx, get_sds_or_proxy(rdfa));
DEBUG_PRINTF("broken %d\n", outs2_broken);
if (!double_byte_ok(rv) && !is_triggered(rdfa.kind)
&& this_idx == rdfa.start_floating
&& this_idx != DEAD_STATE) {
DEBUG_PRINTF("looking for offset accel at %u\n", this_idx);
auto offset = look_for_offset_accel(rdfa, this_idx,
max_allowed_accel_offset);
DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(),
rv.cr.count());
if (double_byte_ok(offset) || offset.cr.count() < rv.cr.count()) {
DEBUG_PRINTF("using offset accel\n");
rv = offset;
}
}
return rv;
}
}


@ -0,0 +1,61 @@
/*
* Copyright (c) 2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MCCLELLANCOMPILE_ACCEL_H
#define MCCLELLANCOMPILE_ACCEL_H
#include "mcclellancompile.h"
#include <map>
namespace ue2 {
struct Grey;
#define ACCEL_DFA_MAX_OFFSET_DEPTH 4
/** Maximum tolerated number of escape characters from an accel state.
* This is larger than for the NFA, as we don't have a budget and the NFA
* cheats on stop characters for sets of states. */
#define ACCEL_DFA_MAX_STOP_CHAR 160
/** Maximum tolerated number of escape characters from an SDS accel state.
* Larger than for normal states, as accelerating SDS is important. Matches
* the NFA value. */
#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192
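/*
 * Illustrative sketch only (mirrors populateAccelerationInfo() above; the
 * names "escapes" and "is_sds_proxy" are assumptions): a state is only
 * considered accelerable if its escape set stays under the relevant limit.
 *
 *     size_t limit = is_sds_proxy ? ACCEL_DFA_MAX_FLOATING_STOP_CHAR
 *                                 : ACCEL_DFA_MAX_STOP_CHAR;
 *     if (escapes.count() > limit) {
 *         // too many stop characters; leave the state unaccelerated
 *     }
 */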
std::map<dstate_id_t, AccelScheme> populateAccelerationInfo(const raw_dfa &rdfa,
const dfa_build_strat &strat,
const Grey &grey);
AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa,
dstate_id_t this_idx,
u32 max_allowed_accel_offset);
}
#endif


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -228,13 +228,13 @@ void calc_min_dist_to_accept(const raw_dfa &raw,
}
}
- void prune_overlong(raw_dfa &raw, u32 max_offset) {
+ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
vector<u32> bob_dist;
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
if (max_min_dist_bob <= max_offset) {
-     return;
+     return false;
}
vector<vector<dstate_id_t> > in_edges;
@ -282,6 +282,8 @@ void prune_overlong(raw_dfa &raw, u32 max_offset) {
/* update specials */
raw.start_floating = new_ids[raw.start_floating];
raw.start_anchored = new_ids[raw.start_anchored];
+ return true;
}
set<ReportID> all_reports(const raw_dfa &rdfa) {
@ -334,4 +336,63 @@ size_t hash_dfa(const raw_dfa &rdfa) {
return v;
}
static
bool has_self_loop(dstate_id_t s, const raw_dfa &raw) {
u16 top_remap = raw.alpha_remap[TOP];
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
if (i != top_remap && raw.states[s].next[i] == s) {
return true;
}
}
return false;
}
dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
if (raw.start_floating != DEAD_STATE) {
DEBUG_PRINTF("has floating start\n");
return raw.start_floating;
}
DEBUG_PRINTF("looking for SDS proxy\n");
dstate_id_t s = raw.start_anchored;
if (has_self_loop(s, raw)) {
return s;
}
u16 top_remap = raw.alpha_remap[TOP];
ue2::unordered_set<dstate_id_t> seen;
while (true) {
seen.insert(s);
DEBUG_PRINTF("basis %hu\n", s);
/* check if we are connected to a state with a self loop */
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t t = raw.states[s].next[i];
if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) {
return t;
}
}
/* find a neighbour to use as a basis for looking for the sds proxy */
dstate_id_t t = DEAD_STATE;
for (u32 i = 0; i < raw.states[s].next.size(); i++) {
dstate_id_t tt = raw.states[s].next[i];
if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) {
t = tt;
break;
}
}
if (t == DEAD_STATE) {
/* we were unable to find a state to use as a SDS proxy */
return DEAD_STATE;
}
s = t;
}
}
} // namespace ue2


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,16 +29,21 @@
#ifndef MCCLELLAN_COMPILE_UTIL_H
#define MCCLELLAN_COMPILE_UTIL_H
+ #include "rdfa.h"
#include "ue2common.h"
#include <set>
namespace ue2 {
- struct raw_dfa;
u32 remove_leading_dots(raw_dfa &raw);
- void prune_overlong(raw_dfa &raw, u32 max_offset);
+ /**
+  * Prunes any states which cannot be reached within max_offset from the start
+  * of the stream. Returns false if no changes are made to the rdfa.
+  */
+ bool prune_overlong(raw_dfa &raw, u32 max_offset);
std::set<ReportID> all_reports(const raw_dfa &rdfa);
bool has_eod_accepts(const raw_dfa &rdfa);
bool has_non_eod_accepts(const raw_dfa &rdfa);
@ -50,6 +55,8 @@ size_t hash_dfa_no_reports(const raw_dfa &rdfa);
/** \brief Compute a simple hash of this raw_dfa, including its reports. */
size_t hash_dfa(const raw_dfa &rdfa);
+ dstate_id_t get_sds_or_proxy(const raw_dfa &raw);
} // namespace ue2
#endif
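/*
 * Illustrative sketch (assumed usage, not part of this diff): a caller that
 * only re-derives DFA properties when prune_overlong() reports a change. The
 * minimize_hopcroft() call and the "grey" variable are assumptions about the
 * surrounding build code.
 */
if (prune_overlong(rdfa, max_offset)) {
    /* states were removed: anything cached about the old automaton
     * (hashes, accept sets) is now stale and must be recomputed */
    minimize_hopcroft(rdfa, grey);
}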


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -124,7 +124,7 @@ char processReports(const struct mpv *m, u8 *reporters,
DEBUG_PRINTF("report %u at %llu\n", curr->report,
report_offset);
- if (curr->unbounded) {
+ if (curr->unbounded && !curr->simple_exhaust) {
assert(rl_count < m->puffette_count);
*rl = curr->report;
++rl;
@ -176,7 +176,9 @@ char processReportsForRange(const struct mpv *m, u8 *reporters,
return MO_CONTINUE_MATCHING;
}
- for (u32 i = 2; i <= length; i++) {
+ DEBUG_PRINTF("length=%zu, rl_count=%u\n", length, rl_count);
+ for (size_t i = 2; i <= length; i++) {
for (u32 j = 0; j < rl_count; j++) {
if (cb(first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) {
DEBUG_PRINTF("bailing\n");


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -108,6 +108,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
fprintf(f, " Puffette %u\n", i);
fprintf(f, " repeats: %u%s\n", p[i].repeats,
p[i].unbounded ? "," : "");
+ if (p[i].simple_exhaust) {
+     fprintf(f, " simple exhaustible\n");
+ }
fprintf(f, " report id: %u\n", p[i].report);
}


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,6 +40,15 @@
struct mpv_puffette {
u32 repeats;
char unbounded;
+ /**
+  * \brief Report is simple-exhaustible.
+  *
+  * If this is true, we do best-effort suppression of runs of reports, only
+  * delivering the first one.
+  */
+ char simple_exhaust;
ReportID report;
};
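/*
 * Illustrative sketch (mirrors the mpv.c hunk above; curr, rl and rl_count
 * are the locals of processReports()): a simple-exhaustible puffette is not
 * kept on the reporter list, so only its first report is delivered.
 */
if (curr->unbounded && !curr->simple_exhaust) {
    assert(rl_count < m->puffette_count);
    *rl = curr->report; /* keep firing this report on later bytes */
    ++rl;
}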


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -36,6 +36,7 @@
#include "util/alloc.h"
#include "util/multibit_internal.h"
#include "util/order_check.h"
+ #include "util/report_manager.h"
#include "util/verify_types.h"
#include <algorithm>
@ -53,10 +54,8 @@ namespace ue2 {
namespace {
struct pcomp {
bool operator()(const raw_puff &a, const raw_puff &b) const {
- ORDER_CHECK(repeats);
- ORDER_CHECK(unbounded);
- ORDER_CHECK(report);
- return false;
+ return tie(a.repeats, a.unbounded, a.simple_exhaust, a.report) <
+        tie(b.repeats, b.unbounded, b.simple_exhaust, b.report);
}
};
@ -84,12 +83,21 @@ struct ClusterKey {
} // namespace
static
- void writePuffette(mpv_puffette *out, const raw_puff &rp) {
+ void writePuffette(mpv_puffette *out, const raw_puff &rp,
+                    const ReportManager &rm) {
DEBUG_PRINTF("outputting %u %d %u to %p\n", rp.repeats, (int)rp.unbounded,
rp.report, out);
out->repeats = rp.repeats;
out->unbounded = rp.unbounded;
- out->report = rp.report;
+ out->simple_exhaust = rp.simple_exhaust;
+ out->report = rm.getProgramOffset(rp.report);
}
+ static
+ void writeSentinel(mpv_puffette *out) {
+     DEBUG_PRINTF("outputting sentinel to %p\n", out);
+     memset(out, 0, sizeof(*out));
+     out->report = INVALID_REPORT;
+ }
static
@ -148,8 +156,8 @@ void populateClusters(const vector<raw_puff> &puffs_in,
static
void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it,
- u32 counter_offset, mpv *m, mpv_kilopuff *kp,
- mpv_puffette **pa) {
+ const ReportManager &rm, u32 counter_offset, mpv *m,
+ mpv_kilopuff *kp, mpv_puffette **pa) {
const CharReach &reach = it->first.reach;
const vector<raw_puff> &puffs = it->second;
@ -182,11 +190,11 @@ void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it,
kp->puffette_offset = verify_u32((char *)*pa - (char *)m);
for (size_t i = 0; i < puffs.size(); i++) {
assert(!it->first.auto_restart || puffs[i].unbounded);
- writePuffette(*pa + i, puffs[i]);
+ writePuffette(*pa + i, puffs[i], rm);
}
*pa += puffs.size();
- writePuffette(*pa, raw_puff(0U, false, INVALID_REPORT, CharReach()));
+ writeSentinel(*pa);
++*pa;
writeDeadPoint(kp, puffs);
@ -301,7 +309,8 @@ const mpv_counter_info &findCounter(const vector<mpv_counter_info> &counters,
}
aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
- const vector<raw_puff> &triggered_puffs) {
+ const vector<raw_puff> &triggered_puffs,
+ const ReportManager &rm) {
assert(!puffs_in.empty() || !triggered_puffs.empty());
u32 puffette_count = puffs_in.size() + triggered_puffs.size();
@ -341,7 +350,7 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
+ sizeof(mpv_counter_info) * counters.size());
mpv_puffette *pa = pa_base;
- writePuffette(pa, raw_puff(0U, false, INVALID_REPORT, CharReach()));
+ writeSentinel(pa);
++pa; /* skip init sentinel */
@ -367,8 +376,9 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
mpv_kilopuff *kp_begin = (mpv_kilopuff *)(m + 1);
mpv_kilopuff *kp = kp_begin;
for (auto it = puff_clusters.begin(); it != puff_clusters.end(); ++it) {
- writeKiloPuff(it, findCounter(counters, kp - kp_begin).counter_offset,
-               m, kp, &pa);
+ writeKiloPuff(it, rm,
+               findCounter(counters, kp - kp_begin).counter_offset, m,
+               kp, &pa);
++kp;
}
assert((char *)pa == (char *)nfa.get() + len);


@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,14 +40,19 @@ struct NFA;
namespace ue2 {
+ class ReportManager;
struct raw_puff {
raw_puff(u32 repeats_in, bool unbounded_in, ReportID report_in,
- const CharReach &reach_in, bool auto_restart_in = false)
+ const CharReach &reach_in, bool auto_restart_in = false,
+ bool simple_exhaust_in = false)
: repeats(repeats_in), unbounded(unbounded_in),
- auto_restart(auto_restart_in), report(report_in), reach(reach_in) {}
+ auto_restart(auto_restart_in), simple_exhaust(simple_exhaust_in),
+ report(report_in), reach(reach_in) {}
u32 repeats; /**< report match after this many matching bytes */
bool unbounded; /**< keep producing matches after repeats are reached */
bool auto_restart; /**< for /[^X]{n}/ type patterns */
+ bool simple_exhaust; /* first report will exhaust us */
ReportID report;
CharReach reach; /**< = ~escapes */
};
@ -56,9 +61,9 @@ struct raw_puff {
* puffs in the triggered_puffs vector are enabled when an TOP_N event is
* delivered corresponding to their index in the vector
*/
- aligned_unique_ptr<NFA>
- mpvCompile(const std::vector<raw_puff> &puffs,
-            const std::vector<raw_puff> &triggered_puffs);
+ aligned_unique_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs,
+                                    const std::vector<raw_puff> &triggered_puffs,
+                                    const ReportManager &rm);
} // namespace ue2

src/nfa/multiaccel_common.h (new file, 265 lines)

@ -0,0 +1,265 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_COMMON_H_
#define MULTIACCEL_COMMON_H_
#include "config.h"
#include "ue2common.h"
#include "util/join.h"
#include "util/bitutils.h"
/*
* When doing shifting, remember that the total number of shifts should be n-1
*/
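/*
 * Worked example (illustrative only): to test for a run of 3 set bits,
 * apply n - 1 = 2 single-bit shift-and-ANDs. Starting from
 * z = 0b0111010 (matches at positions 1, 3, 4 and 5):
 *
 *     z &= z >> 1;   // 0b0011000  (positions with a set neighbour above)
 *     z &= z >> 1;   // 0b0001000  (position 3 starts a run of >= 3)
 *
 * Any surviving bit marks the start of a sufficiently long run.
 */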
#define VARISHIFT(src, dst, len) \
do { \
(dst) &= (src) >> (len); \
} while (0)
#define STATIC_SHIFT1(x) \
do { \
(x) &= (x) >> 1; \
} while (0)
#define STATIC_SHIFT2(x) \
do { \
(x) &= (x) >> 2;\
} while (0)
#define STATIC_SHIFT4(x) \
do { \
(x) &= (x) >> 4; \
} while (0)
#define STATIC_SHIFT8(x) \
do { \
(x) &= (x) >> 8; \
} while (0)
#define SHIFT1(x) \
do {} while (0)
#define SHIFT2(x) \
do { \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT3(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT4(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT5(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT6(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT7(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT8(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT9(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT10(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT11(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT12(x); \
do { \
SHIFT8(x);\
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT13(x); \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT14(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT15(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT16(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT17(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT18(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT19(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT20(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT21(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT22(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT23(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT24(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT25(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT26(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT27(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT28(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT29(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT30(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT31(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT32(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT8(x); \
} while (0)
/*
* This function is used by 32-bit multiaccel matchers. 32-bit matchers accept
* a 32-bit integer as a match mask, where the low 16 bits are the movemask
* result and the high 16 bits are "don't care" values. The match position
* returned is not expected to be higher than 16.
*/
static really_inline
const u8 *match32(const u8 *buf, const u32 z) {
if (unlikely(z != 0)) {
u32 pos = ctz32(z);
assert(pos < 16);
return buf + pos;
}
return NULL;
}
/*
* This function is used by 64-bit multiaccel matchers. 64-bit matchers accept
* a 64-bit integer as a match mask, where the low 32 bits are the movemask
* result and the high 32 bits are "don't care" values. The match position
* returned is not expected to be higher than 32.
*/
static really_inline
const u8 *match64(const u8 *buf, const u64a z) {
if (unlikely(z != 0)) {
u32 pos = ctz64(z);
assert(pos < 32);
return buf + pos;
}
return NULL;
}
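/*
 * Typical caller shape (sketch only; the SIMD helpers are assumed to be the
 * ones from util/simd_utils.h, and "chars" is an assumed broadcast of the
 * accelerated character):
 *
 *     m128 data = loadu128(buf);
 *     u32 z = movemask128(eq128(chars, data)); // low 16 bits valid
 *     const u8 *loc = match32(buf, z);         // NULL if no match
 */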
#endif /* MULTIACCEL_COMMON_H_ */


@ -0,0 +1,439 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "multiaccel_compilehelper.h"
using namespace std;
using namespace ue2;
#ifdef DEBUG
static const char* state_to_str[] = {
"FIRST_RUN",
"SECOND_RUN",
"WAITING_FOR_GRAB",
"FIRST_TAIL",
"SECOND_TAIL",
"STOPPED",
"INVALID"
};
static const char* type_to_str[] = {
"SHIFT",
"SHIFTGRAB",
"DOUBLESHIFT",
"DOUBLESHIFTGRAB",
"LONG",
"LONGGRAB",
"NONE"
};
static
void dumpMultiaccelState(const accel_data &d) {
DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
type_to_str[(unsigned) d.type],
state_to_str[(unsigned) d.state],
d.len1, d.tlen1, d.len2, d.tlen2);
}
#endif
/* stop all the matching. this may render most schemes invalid. */
static
void stop(accel_data &d) {
switch (d.state) {
case STATE_STOPPED:
case STATE_INVALID:
break;
case STATE_FIRST_TAIL:
case STATE_SECOND_RUN:
/*
* Shift matchers are special case, because they have "tails".
* When shift matcher reaches a mid/endpoint, tail mode is
* activated, which looks for more matches to extend the match.
*
* For example, consider the pattern /a{5}ba{3}/. Under normal circumstances,
* the long-grab matcher will be picked for this pattern (matching a run of
* a's, followed by a not-a), because the doubleshift matcher would be
* confused by consecutive a's and would parse the pattern as a.{0}a.{0}a
* (two shifts by 1) and throw out the rest of the pattern.
*
* With tails, we defer ending the run until we actually run out of
* matching characters, so the above pattern will now be parsed by the
* doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4).
*
* So if we are stopping shift matchers, we should check whether we are in
* the process of matching the first tail or the second run. If we are, we
* can't finish the second run as we are stopping, but we can try and split
* the first tail instead to obtain a valid second run.
*/
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) {
// can't split an empty void...
d.state = STATE_INVALID;
break;
}
d.len2 = 0;
d.state = STATE_STOPPED;
break;
case STATE_SECOND_TAIL:
d.state = STATE_STOPPED;
break;
case STATE_WAITING_FOR_GRAB:
case STATE_FIRST_RUN:
if (d.type == MultibyteAccelInfo::MAT_LONG) {
d.state = STATE_STOPPED;
} else {
d.state = STATE_INVALID;
}
break;
}
}
static
void validate(accel_data &d, unsigned max_len) {
// try and fit in all our tails
if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) {
// case 1: everything fits in
d.len1 += d.tlen1;
d.len2 += d.tlen2;
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) {
// case 2: everything but the second tail fits in
d.len1 += d.tlen1;
d.tlen1 = 0;
// try going for a partial tail
if (d.tlen2 != 0) {
int new_tlen2 = max_len - 1 - d.len1 - d.len2;
if (new_tlen2 > 0) {
d.len2 += new_tlen2;
}
d.tlen2 = 0;
}
} else if (d.len1 + d.tlen1 < max_len) {
// case 3: first run and its tail fits in
if (d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
// split the tail into a second run
d.len2 = d.tlen1;
} else {
d.len1 += d.tlen1;
d.len2 = 0;
}
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 < max_len) {
// case 4: nothing but the first run fits in
// try going for a partial tail
if (d.tlen1 != 0) {
int new_tlen1 = max_len - 1 - d.len1;
if (new_tlen1 > 0) {
d.len1 += new_tlen1;
}
d.tlen1 = 0;
}
d.len2 = 0;
d.tlen2 = 0;
}
// if we removed our second run, doubleshift matchers are no longer valid
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) {
d.state = STATE_INVALID;
} else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) {
// long matchers can just stop whenever they want to
d.len1 = max_len - 1;
}
// now, general sanity checks
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) {
d.state = STATE_INVALID;
}
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) {
d.state = STATE_INVALID;
}
}
static
void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
switch (d.type) {
case MultibyteAccelInfo::MAT_LONG:
{
/*
* For long matcher, we want lots of consecutive same-or-subset
* char-reaches
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else {
d.state = STATE_STOPPED;
}
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
{
/*
* For long-grab matcher, we want lots of consecutive same-or-subset
* char-reaches with a negative match in the end.
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else if (!(ref_cr & cur_cr).any()) {
/* we grabbed, stop immediately */
d.state = STATE_STOPPED;
} else {
/* our run-n-grab was interrupted; mark as invalid */
d.state = STATE_INVALID;
}
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
{
/*
* For shift-grab matcher, we want two matches separated by anything;
* however the second vertex *must* be a negative (non-overlapping) match.
*
* Shiftgrab matcher is identical to shift except for presence of grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_SHIFT:
{
/*
* For shift-matcher, we want two matches separated by anything.
*/
if (ref_cr == cur_cr) {
// keep matching tail
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
// simply advance
d.len1++;
break;
case STATE_FIRST_TAIL:
// we found a non-matching char after tail, so stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
{
/*
* For double shift-grab matcher, we want two matches separated by
* either negative matches or dots; however the second vertex *must*
* be a negative match.
*
* Doubleshiftgrab matcher is identical to doubleshift except for
* presence of grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_DSHIFT:
{
/*
* For double shift matcher, we want three matches, each separated
* by a lot of anything.
*
* Doubleshift matcher is complicated by presence of tails.
*/
if (ref_cr == cur_cr) {
// decide if we are activating second shift or matching tails
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
d.len2 = 1; // we're now ready for our second run
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
case STATE_SECOND_RUN:
d.state = STATE_SECOND_TAIL;
break;
case STATE_SECOND_TAIL:
d.tlen2++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
d.len1++;
break;
case STATE_FIRST_TAIL:
// start second run
d.state = STATE_SECOND_RUN;
d.len2++;
break;
case STATE_SECOND_RUN:
d.len2++;
break;
case STATE_SECOND_TAIL:
// stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
default:
// shouldn't happen
assert(0);
break;
}
}
MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off,
unsigned max_len) :
cr(ref_cr), offset(off), max_len(max_len) {
int accel_num = (int) MultibyteAccelInfo::MAT_MAX;
accels.resize(accel_num);
// mark everything as valid
for (int i = 0; i < accel_num; i++) {
accel_data &ad = accels[i];
ad.len1 = 1;
ad.type = (MultibyteAccelInfo::multiaccel_type) i;
/* for shift-grab matchers, we are waiting for the grab right at the start */
if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB
|| ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
ad.state = STATE_WAITING_FOR_GRAB;
} else {
ad.state = STATE_FIRST_RUN;
}
}
}
bool MultiaccelCompileHelper::canAdvance() {
for (const accel_data &ad : accels) {
if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) {
return true;
}
}
return false;
}
void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
for (accel_data &ad : accels) {
if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) {
continue;
}
match(ad, cr, cur_cr);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
}
}
MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
int best_len = 0;
accel_data best;
DEBUG_PRINTF("Stopping multiaccel compile\n");
for (accel_data &ad : accels) {
// stop our matching
stop(ad);
validate(ad, max_len);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
// skip invalid schemes
if (ad.state == STATE_INVALID) {
continue;
}
DEBUG_PRINTF("Marking as viable\n");
// TODO: relative strengths of accel schemes? e.g. a shorter long match
// might in some cases be preferable to a longer double shift match,
// depending on length.
int as_len = ad.len1 + ad.len2;
if (as_len >= best_len) {
DEBUG_PRINTF("Marking as best\n");
best_len = as_len;
best = ad;
}
}
// if we found at least one accel scheme, return it
if (best.state != STATE_INVALID) {
#ifdef DEBUG
DEBUG_PRINTF("Picked best multiaccel state:\n");
dumpMultiaccelState(best);
#endif
MultibyteAccelInfo info;
info.cr = cr;
info.offset = offset;
info.len1 = best.len1;
info.len2 = best.len2;
info.type = best.type;
return info;
}
return MultibyteAccelInfo();
}


@ -26,44 +26,50 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
- #include "internal_report.h"
- #include "report.h"
- #include "report_manager.h"
- #include "ue2common.h"
+ #ifndef MULTIACCELCOMPILE_H_
+ #define MULTIACCELCOMPILE_H_
+ #include "nfagraph/ng_limex_accel.h"
+ #include <vector>
namespace ue2 {
- void writeInternalReport(const Report &report, const ReportManager &rm,
-                          internal_report *ir) {
-     assert(ir);
-     assert(ISALIGNED(ir));
-     ir->type = report.type;
-     ir->hasBounds = report.hasBounds() ? 1 : 0;
-     ir->quashSom = report.quashSom ? 1 : 0;
-     ir->minOffset = report.minOffset;
-     ir->maxOffset = report.maxOffset;
-     ir->minLength = report.minLength;
-     ir->ekey = report.ekey;
-     ir->offsetAdjust = report.offsetAdjust;
-     ir->onmatch = report.onmatch;
-     switch (report.type) {
-     case INTERNAL_ROSE_CHAIN:
-         ir->aux.topSquashDistance = report.topSquashDistance;
-         break;
-     case EXTERNAL_CALLBACK_SOM_REV_NFA:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
-     case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
-         ir->aux.revNfaIndex = report.revNfaIndex;
-         break;
-     default:
-         ir->aux.somDistance = report.somDistance;
-         break;
-     }
-     // Dedupe keys are managed by ReportManager.
-     ir->dkey = rm.getDkey(report);
- }
- } // namespace ue2
+ /* accel scheme state machine */
+ enum accel_scheme_state {
+     STATE_FIRST_RUN,
+     STATE_SECOND_RUN,
+     STATE_WAITING_FOR_GRAB,
+     STATE_FIRST_TAIL,
+     STATE_SECOND_TAIL,
+     STATE_STOPPED,
+     STATE_INVALID
+ };
+ struct accel_data {
+     MultibyteAccelInfo::multiaccel_type type = MultibyteAccelInfo::MAT_NONE;
+     accel_scheme_state state = STATE_INVALID;
+     unsigned len1 = 0; /* length of first run */
+     unsigned len2 = 0; /* length of second run, if present */
+     unsigned tlen1 = 0; /* first tail length */
+     unsigned tlen2 = 0; /* second tail length */
+ };
+ class MultiaccelCompileHelper {
+ private:
+     const CharReach &cr;
+     u32 offset;
+     std::vector<accel_data> accels;
+     unsigned max_len;
+ public:
+     MultiaccelCompileHelper(const CharReach &cr, u32 off, unsigned max_len);
+     bool canAdvance();
+     MultibyteAccelInfo getBestScheme();
+     void advance(const ue2::CharReach &cr);
+ };
+ }; // namespace
+ #endif /* MULTIACCELCOMPILE_H_ */
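/*
 * Illustrative sketch of the intended driver loop (assumed usage, not part of
 * this diff; the per-position "reaches" sequence and the max_len bound are
 * assumptions, and the real caller lives in the limex acceleration code):
 */
MultiaccelCompileHelper mac(ref_cr, offset, max_len);
for (const CharReach &cur_cr : reaches) {
    if (!mac.canAdvance()) {
        break; /* every candidate scheme has stopped or become invalid */
    }
    mac.advance(cur_cr);
}
MultibyteAccelInfo mai = mac.getBestScheme();
if (mai.type != MultibyteAccelInfo::MAT_NONE) {
    /* a viable multibyte acceleration scheme was found */
}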


@ -0,0 +1,149 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFT_H_
#define MULTIACCEL_DOUBLESHIFT_H_
#include "multiaccel_common.h"
#define DOUBLESHIFT_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t tmp = z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
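/*
 * Reading of the macro above (illustrative): since VARISHIFT(src, dst, n)
 * is dst &= src >> n, a bit survives in z at position i only when the
 * movemask had matches at i, i + len and i + len + len2, i.e. three matches
 * separated by the two shift distances. The masks OR'd into the upper
 * ("don't care") half stop runs that extend past the valid low half of the
 * movemask from being cut short at the block boundary.
 */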
#define DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH(n, u32, 32)
#define DOUBLESHIFT_MATCH_64_DEF(n) \
DOUBLESHIFT_MATCH(n, u64a, 64)
#define DOUBLESHIFT_MATCH_DEF(n) \
DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH_64_DEF(n)
DOUBLESHIFT_MATCH_DEF(1)
DOUBLESHIFT_MATCH_DEF(2)
DOUBLESHIFT_MATCH_DEF(3)
DOUBLESHIFT_MATCH_DEF(4)
DOUBLESHIFT_MATCH_DEF(5)
DOUBLESHIFT_MATCH_DEF(6)
DOUBLESHIFT_MATCH_DEF(7)
DOUBLESHIFT_MATCH_DEF(8)
DOUBLESHIFT_MATCH_DEF(9)
DOUBLESHIFT_MATCH_DEF(10)
DOUBLESHIFT_MATCH_DEF(11)
DOUBLESHIFT_MATCH_DEF(12)
DOUBLESHIFT_MATCH_DEF(13)
DOUBLESHIFT_MATCH_DEF(14)
DOUBLESHIFT_MATCH_DEF(15)
DOUBLESHIFT_MATCH_64_DEF(16)
DOUBLESHIFT_MATCH_64_DEF(17)
DOUBLESHIFT_MATCH_64_DEF(18)
DOUBLESHIFT_MATCH_64_DEF(19)
DOUBLESHIFT_MATCH_64_DEF(20)
DOUBLESHIFT_MATCH_64_DEF(21)
DOUBLESHIFT_MATCH_64_DEF(22)
DOUBLESHIFT_MATCH_64_DEF(23)
DOUBLESHIFT_MATCH_64_DEF(24)
DOUBLESHIFT_MATCH_64_DEF(25)
DOUBLESHIFT_MATCH_64_DEF(26)
DOUBLESHIFT_MATCH_64_DEF(27)
DOUBLESHIFT_MATCH_64_DEF(28)
DOUBLESHIFT_MATCH_64_DEF(29)
DOUBLESHIFT_MATCH_64_DEF(30)
DOUBLESHIFT_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshift_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_32_1,
&doubleshiftMatch_32_2,
&doubleshiftMatch_32_3,
&doubleshiftMatch_32_4,
&doubleshiftMatch_32_5,
&doubleshiftMatch_32_6,
&doubleshiftMatch_32_7,
&doubleshiftMatch_32_8,
&doubleshiftMatch_32_9,
&doubleshiftMatch_32_10,
&doubleshiftMatch_32_11,
&doubleshiftMatch_32_12,
&doubleshiftMatch_32_13,
&doubleshiftMatch_32_14,
&doubleshiftMatch_32_15,
};
static
const UNUSED u8 * (*doubleshift_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_64_1,
&doubleshiftMatch_64_2,
&doubleshiftMatch_64_3,
&doubleshiftMatch_64_4,
&doubleshiftMatch_64_5,
&doubleshiftMatch_64_6,
&doubleshiftMatch_64_7,
&doubleshiftMatch_64_8,
&doubleshiftMatch_64_9,
&doubleshiftMatch_64_10,
&doubleshiftMatch_64_11,
&doubleshiftMatch_64_12,
&doubleshiftMatch_64_13,
&doubleshiftMatch_64_14,
&doubleshiftMatch_64_15,
&doubleshiftMatch_64_16,
&doubleshiftMatch_64_17,
&doubleshiftMatch_64_18,
&doubleshiftMatch_64_19,
&doubleshiftMatch_64_20,
&doubleshiftMatch_64_21,
&doubleshiftMatch_64_22,
&doubleshiftMatch_64_23,
&doubleshiftMatch_64_24,
&doubleshiftMatch_64_25,
&doubleshiftMatch_64_26,
&doubleshiftMatch_64_27,
&doubleshiftMatch_64_28,
&doubleshiftMatch_64_29,
&doubleshiftMatch_64_30,
&doubleshiftMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFT_H_ */


@ -0,0 +1,152 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFTGRAB_H_
#define MULTIACCEL_DOUBLESHIFTGRAB_H_
#include "multiaccel_common.h"
#define DOUBLESHIFTGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t neg = ~z; \
match_t tmp = z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
neg |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(neg, z, 1); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u32, 32)
#define DOUBLESHIFTGRAB_MATCH_64_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u64a, 64)
#define DOUBLESHIFTGRAB_MATCH_DEF(n) \
DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH_64_DEF(n)
DOUBLESHIFTGRAB_MATCH_DEF(1)
DOUBLESHIFTGRAB_MATCH_DEF(2)
DOUBLESHIFTGRAB_MATCH_DEF(3)
DOUBLESHIFTGRAB_MATCH_DEF(4)
DOUBLESHIFTGRAB_MATCH_DEF(5)
DOUBLESHIFTGRAB_MATCH_DEF(6)
DOUBLESHIFTGRAB_MATCH_DEF(7)
DOUBLESHIFTGRAB_MATCH_DEF(8)
DOUBLESHIFTGRAB_MATCH_DEF(9)
DOUBLESHIFTGRAB_MATCH_DEF(10)
DOUBLESHIFTGRAB_MATCH_DEF(11)
DOUBLESHIFTGRAB_MATCH_DEF(12)
DOUBLESHIFTGRAB_MATCH_DEF(13)
DOUBLESHIFTGRAB_MATCH_DEF(14)
DOUBLESHIFTGRAB_MATCH_DEF(15)
DOUBLESHIFTGRAB_MATCH_64_DEF(16)
DOUBLESHIFTGRAB_MATCH_64_DEF(17)
DOUBLESHIFTGRAB_MATCH_64_DEF(18)
DOUBLESHIFTGRAB_MATCH_64_DEF(19)
DOUBLESHIFTGRAB_MATCH_64_DEF(20)
DOUBLESHIFTGRAB_MATCH_64_DEF(21)
DOUBLESHIFTGRAB_MATCH_64_DEF(22)
DOUBLESHIFTGRAB_MATCH_64_DEF(23)
DOUBLESHIFTGRAB_MATCH_64_DEF(24)
DOUBLESHIFTGRAB_MATCH_64_DEF(25)
DOUBLESHIFTGRAB_MATCH_64_DEF(26)
DOUBLESHIFTGRAB_MATCH_64_DEF(27)
DOUBLESHIFTGRAB_MATCH_64_DEF(28)
DOUBLESHIFTGRAB_MATCH_64_DEF(29)
DOUBLESHIFTGRAB_MATCH_64_DEF(30)
DOUBLESHIFTGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_32_1,
&doubleshiftgrabMatch_32_2,
&doubleshiftgrabMatch_32_3,
&doubleshiftgrabMatch_32_4,
&doubleshiftgrabMatch_32_5,
&doubleshiftgrabMatch_32_6,
&doubleshiftgrabMatch_32_7,
&doubleshiftgrabMatch_32_8,
&doubleshiftgrabMatch_32_9,
&doubleshiftgrabMatch_32_10,
&doubleshiftgrabMatch_32_11,
&doubleshiftgrabMatch_32_12,
&doubleshiftgrabMatch_32_13,
&doubleshiftgrabMatch_32_14,
&doubleshiftgrabMatch_32_15,
};
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_64_1,
&doubleshiftgrabMatch_64_2,
&doubleshiftgrabMatch_64_3,
&doubleshiftgrabMatch_64_4,
&doubleshiftgrabMatch_64_5,
&doubleshiftgrabMatch_64_6,
&doubleshiftgrabMatch_64_7,
&doubleshiftgrabMatch_64_8,
&doubleshiftgrabMatch_64_9,
&doubleshiftgrabMatch_64_10,
&doubleshiftgrabMatch_64_11,
&doubleshiftgrabMatch_64_12,
&doubleshiftgrabMatch_64_13,
&doubleshiftgrabMatch_64_14,
&doubleshiftgrabMatch_64_15,
&doubleshiftgrabMatch_64_16,
&doubleshiftgrabMatch_64_17,
&doubleshiftgrabMatch_64_18,
&doubleshiftgrabMatch_64_19,
&doubleshiftgrabMatch_64_20,
&doubleshiftgrabMatch_64_21,
&doubleshiftgrabMatch_64_22,
&doubleshiftgrabMatch_64_23,
&doubleshiftgrabMatch_64_24,
&doubleshiftgrabMatch_64_25,
&doubleshiftgrabMatch_64_26,
&doubleshiftgrabMatch_64_27,
&doubleshiftgrabMatch_64_28,
&doubleshiftgrabMatch_64_29,
&doubleshiftgrabMatch_64_30,
&doubleshiftgrabMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFTGRAB_H_ */

src/nfa/multiaccel_long.h (new file, 145 lines)

@ -0,0 +1,145 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONG_H_
#define MULTIACCEL_LONG_H_
#include "multiaccel_common.h"
#define LONG_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONG_MATCH_32_DEF(n) \
LONG_MATCH(n, u32, 32)
#define LONG_MATCH_64_DEF(n) \
LONG_MATCH(n, u64a, 64)
#define LONG_MATCH_DEF(n) \
LONG_MATCH_32_DEF(n) \
LONG_MATCH_64_DEF(n)
LONG_MATCH_DEF(1)
LONG_MATCH_DEF(2)
LONG_MATCH_DEF(3)
LONG_MATCH_DEF(4)
LONG_MATCH_DEF(5)
LONG_MATCH_DEF(6)
LONG_MATCH_DEF(7)
LONG_MATCH_DEF(8)
LONG_MATCH_DEF(9)
LONG_MATCH_DEF(10)
LONG_MATCH_DEF(11)
LONG_MATCH_DEF(12)
LONG_MATCH_DEF(13)
LONG_MATCH_DEF(14)
LONG_MATCH_DEF(15)
LONG_MATCH_64_DEF(16)
LONG_MATCH_64_DEF(17)
LONG_MATCH_64_DEF(18)
LONG_MATCH_64_DEF(19)
LONG_MATCH_64_DEF(20)
LONG_MATCH_64_DEF(21)
LONG_MATCH_64_DEF(22)
LONG_MATCH_64_DEF(23)
LONG_MATCH_64_DEF(24)
LONG_MATCH_64_DEF(25)
LONG_MATCH_64_DEF(26)
LONG_MATCH_64_DEF(27)
LONG_MATCH_64_DEF(28)
LONG_MATCH_64_DEF(29)
LONG_MATCH_64_DEF(30)
LONG_MATCH_64_DEF(31)
static
const UNUSED u8 *(*long_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longMatch_32_1,
&longMatch_32_2,
&longMatch_32_3,
&longMatch_32_4,
&longMatch_32_5,
&longMatch_32_6,
&longMatch_32_7,
&longMatch_32_8,
&longMatch_32_9,
&longMatch_32_10,
&longMatch_32_11,
&longMatch_32_12,
&longMatch_32_13,
&longMatch_32_14,
&longMatch_32_15,
};
static
const UNUSED u8 *(*long_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longMatch_64_1,
&longMatch_64_2,
&longMatch_64_3,
&longMatch_64_4,
&longMatch_64_5,
&longMatch_64_6,
&longMatch_64_7,
&longMatch_64_8,
&longMatch_64_9,
&longMatch_64_10,
&longMatch_64_11,
&longMatch_64_12,
&longMatch_64_13,
&longMatch_64_14,
&longMatch_64_15,
&longMatch_64_16,
&longMatch_64_17,
&longMatch_64_18,
&longMatch_64_19,
&longMatch_64_20,
&longMatch_64_21,
&longMatch_64_22,
&longMatch_64_23,
&longMatch_64_24,
&longMatch_64_25,
&longMatch_64_26,
&longMatch_64_27,
&longMatch_64_28,
&longMatch_64_29,
&longMatch_64_30,
&longMatch_64_31,
};
#endif /* MULTIACCEL_LONG_H_ */


@ -0,0 +1,148 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONGGRAB_H_
#define MULTIACCEL_LONGGRAB_H_
#include "multiaccel_common.h"
#define LONGGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
match_t tmp = ~z; \
tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH(n, u32, 32)
#define LONGGRAB_MATCH_64_DEF(n) \
LONGGRAB_MATCH(n, u64a, 64)
#define LONGGRAB_MATCH_DEF(n) \
LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH_64_DEF(n)
LONGGRAB_MATCH_DEF(1)
LONGGRAB_MATCH_DEF(2)
LONGGRAB_MATCH_DEF(3)
LONGGRAB_MATCH_DEF(4)
LONGGRAB_MATCH_DEF(5)
LONGGRAB_MATCH_DEF(6)
LONGGRAB_MATCH_DEF(7)
LONGGRAB_MATCH_DEF(8)
LONGGRAB_MATCH_DEF(9)
LONGGRAB_MATCH_DEF(10)
LONGGRAB_MATCH_DEF(11)
LONGGRAB_MATCH_DEF(12)
LONGGRAB_MATCH_DEF(13)
LONGGRAB_MATCH_DEF(14)
LONGGRAB_MATCH_DEF(15)
LONGGRAB_MATCH_64_DEF(16)
LONGGRAB_MATCH_64_DEF(17)
LONGGRAB_MATCH_64_DEF(18)
LONGGRAB_MATCH_64_DEF(19)
LONGGRAB_MATCH_64_DEF(20)
LONGGRAB_MATCH_64_DEF(21)
LONGGRAB_MATCH_64_DEF(22)
LONGGRAB_MATCH_64_DEF(23)
LONGGRAB_MATCH_64_DEF(24)
LONGGRAB_MATCH_64_DEF(25)
LONGGRAB_MATCH_64_DEF(26)
LONGGRAB_MATCH_64_DEF(27)
LONGGRAB_MATCH_64_DEF(28)
LONGGRAB_MATCH_64_DEF(29)
LONGGRAB_MATCH_64_DEF(30)
LONGGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 *(*longgrab_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longgrabMatch_32_1,
&longgrabMatch_32_2,
&longgrabMatch_32_3,
&longgrabMatch_32_4,
&longgrabMatch_32_5,
&longgrabMatch_32_6,
&longgrabMatch_32_7,
&longgrabMatch_32_8,
&longgrabMatch_32_9,
&longgrabMatch_32_10,
&longgrabMatch_32_11,
&longgrabMatch_32_12,
&longgrabMatch_32_13,
&longgrabMatch_32_14,
&longgrabMatch_32_15,
};
static
const UNUSED u8 *(*longgrab_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longgrabMatch_64_1,
&longgrabMatch_64_2,
&longgrabMatch_64_3,
&longgrabMatch_64_4,
&longgrabMatch_64_5,
&longgrabMatch_64_6,
&longgrabMatch_64_7,
&longgrabMatch_64_8,
&longgrabMatch_64_9,
&longgrabMatch_64_10,
&longgrabMatch_64_11,
&longgrabMatch_64_12,
&longgrabMatch_64_13,
&longgrabMatch_64_14,
&longgrabMatch_64_15,
&longgrabMatch_64_16,
&longgrabMatch_64_17,
&longgrabMatch_64_18,
&longgrabMatch_64_19,
&longgrabMatch_64_20,
&longgrabMatch_64_21,
&longgrabMatch_64_22,
&longgrabMatch_64_23,
&longgrabMatch_64_24,
&longgrabMatch_64_25,
&longgrabMatch_64_26,
&longgrabMatch_64_27,
&longgrabMatch_64_28,
&longgrabMatch_64_29,
&longgrabMatch_64_30,
&longgrabMatch_64_31,
};
#endif /* MULTIACCEL_LONGGRAB_H_ */

src/nfa/multiaccel_shift.h (new file, 145 lines)

@ -0,0 +1,145 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_SHIFT_H_
#define MULTIACCEL_SHIFT_H_
#include "multiaccel_common.h"
#define SHIFT_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(shiftMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
if (unlikely(z)) { \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define SHIFT_MATCH_32_DEF(n) \
SHIFT_MATCH(n, u32, 32)
#define SHIFT_MATCH_64_DEF(n) \
SHIFT_MATCH(n, u64a, 64)
#define SHIFT_MATCH_DEF(n) \
SHIFT_MATCH_32_DEF(n) \
SHIFT_MATCH_64_DEF(n)
SHIFT_MATCH_DEF(1)
SHIFT_MATCH_DEF(2)
SHIFT_MATCH_DEF(3)
SHIFT_MATCH_DEF(4)
SHIFT_MATCH_DEF(5)
SHIFT_MATCH_DEF(6)
SHIFT_MATCH_DEF(7)
SHIFT_MATCH_DEF(8)
SHIFT_MATCH_DEF(9)
SHIFT_MATCH_DEF(10)
SHIFT_MATCH_DEF(11)
SHIFT_MATCH_DEF(12)
SHIFT_MATCH_DEF(13)
SHIFT_MATCH_DEF(14)
SHIFT_MATCH_DEF(15)
SHIFT_MATCH_64_DEF(16)
SHIFT_MATCH_64_DEF(17)
SHIFT_MATCH_64_DEF(18)
SHIFT_MATCH_64_DEF(19)
SHIFT_MATCH_64_DEF(20)
SHIFT_MATCH_64_DEF(21)
SHIFT_MATCH_64_DEF(22)
SHIFT_MATCH_64_DEF(23)
SHIFT_MATCH_64_DEF(24)
SHIFT_MATCH_64_DEF(25)
SHIFT_MATCH_64_DEF(26)
SHIFT_MATCH_64_DEF(27)
SHIFT_MATCH_64_DEF(28)
SHIFT_MATCH_64_DEF(29)
SHIFT_MATCH_64_DEF(30)
SHIFT_MATCH_64_DEF(31)
static
const UNUSED u8 * (*shift_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&shiftMatch_32_1,
&shiftMatch_32_2,
&shiftMatch_32_3,
&shiftMatch_32_4,
&shiftMatch_32_5,
&shiftMatch_32_6,
&shiftMatch_32_7,
&shiftMatch_32_8,
&shiftMatch_32_9,
&shiftMatch_32_10,
&shiftMatch_32_11,
&shiftMatch_32_12,
&shiftMatch_32_13,
&shiftMatch_32_14,
&shiftMatch_32_15,
};
static
const UNUSED u8 * (*shift_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&shiftMatch_64_1,
&shiftMatch_64_2,
&shiftMatch_64_3,
&shiftMatch_64_4,
&shiftMatch_64_5,
&shiftMatch_64_6,
&shiftMatch_64_7,
&shiftMatch_64_8,
&shiftMatch_64_9,
&shiftMatch_64_10,
&shiftMatch_64_11,
&shiftMatch_64_12,
&shiftMatch_64_13,
&shiftMatch_64_14,
&shiftMatch_64_15,
&shiftMatch_64_16,
&shiftMatch_64_17,
&shiftMatch_64_18,
&shiftMatch_64_19,
&shiftMatch_64_20,
&shiftMatch_64_21,
&shiftMatch_64_22,
&shiftMatch_64_23,
&shiftMatch_64_24,
&shiftMatch_64_25,
&shiftMatch_64_26,
&shiftMatch_64_27,
&shiftMatch_64_28,
&shiftMatch_64_29,
&shiftMatch_64_30,
&shiftMatch_64_31,
};
#endif /* MULTIACCEL_SHIFT_H_ */
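For reference, a minimal standalone sketch (not Hyperscan code) of the idea behind these shiftMatch kernels: z is a bitmask with bit i set when buf[i] belongs to the accelerated character class, and a match is a run of run_len consecutive set bits. The helper name first_run_of is made up for illustration, and __builtin_ctz assumes GCC/Clang; the real kernels get the same effect branchlessly via VARISHIFT and the match() helpers from multiaccel_common.h.

#include <stddef.h>
#include <stdint.h>

/* Return a pointer to the first position where `len` consecutive bits of z are
 * set, or NULL if no such run exists. Bit i of z corresponds to buf[i]. */
static const uint8_t *first_run_of(const uint8_t *buf, uint32_t z, unsigned len) {
    for (unsigned k = 1; k < len; k++) {
        z &= z >> 1;                /* keep bit i only if bit i+1 was also set */
    }
    if (!z) {
        return NULL;                /* no run of the required length in this block */
    }
    return buf + __builtin_ctz(z);  /* offset of the first qualifying byte */
}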

148
src/nfa/multiaccel_shiftgrab.h Normal file

@ -0,0 +1,148 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_SHIFTGRAB_H_
#define MULTIACCEL_SHIFTGRAB_H_
#include "multiaccel_common.h"
#define SHIFTGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(shiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
if (unlikely(z)) { \
match_t tmp = ~z; \
z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, z, 1); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define SHIFTGRAB_MATCH_32_DEF(n) \
SHIFTGRAB_MATCH(n, u32, 32)
#define SHIFTGRAB_MATCH_64_DEF(n) \
SHIFTGRAB_MATCH(n, u64a, 64)
#define SHIFTGRAB_MATCH_DEF(n) \
SHIFTGRAB_MATCH_32_DEF(n) \
SHIFTGRAB_MATCH_64_DEF(n)
SHIFTGRAB_MATCH_DEF(1)
SHIFTGRAB_MATCH_DEF(2)
SHIFTGRAB_MATCH_DEF(3)
SHIFTGRAB_MATCH_DEF(4)
SHIFTGRAB_MATCH_DEF(5)
SHIFTGRAB_MATCH_DEF(6)
SHIFTGRAB_MATCH_DEF(7)
SHIFTGRAB_MATCH_DEF(8)
SHIFTGRAB_MATCH_DEF(9)
SHIFTGRAB_MATCH_DEF(10)
SHIFTGRAB_MATCH_DEF(11)
SHIFTGRAB_MATCH_DEF(12)
SHIFTGRAB_MATCH_DEF(13)
SHIFTGRAB_MATCH_DEF(14)
SHIFTGRAB_MATCH_DEF(15)
SHIFTGRAB_MATCH_64_DEF(16)
SHIFTGRAB_MATCH_64_DEF(17)
SHIFTGRAB_MATCH_64_DEF(18)
SHIFTGRAB_MATCH_64_DEF(19)
SHIFTGRAB_MATCH_64_DEF(20)
SHIFTGRAB_MATCH_64_DEF(21)
SHIFTGRAB_MATCH_64_DEF(22)
SHIFTGRAB_MATCH_64_DEF(23)
SHIFTGRAB_MATCH_64_DEF(24)
SHIFTGRAB_MATCH_64_DEF(25)
SHIFTGRAB_MATCH_64_DEF(26)
SHIFTGRAB_MATCH_64_DEF(27)
SHIFTGRAB_MATCH_64_DEF(28)
SHIFTGRAB_MATCH_64_DEF(29)
SHIFTGRAB_MATCH_64_DEF(30)
SHIFTGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 * (*shiftgrab_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&shiftgrabMatch_32_1,
&shiftgrabMatch_32_2,
&shiftgrabMatch_32_3,
&shiftgrabMatch_32_4,
&shiftgrabMatch_32_5,
&shiftgrabMatch_32_6,
&shiftgrabMatch_32_7,
&shiftgrabMatch_32_8,
&shiftgrabMatch_32_9,
&shiftgrabMatch_32_10,
&shiftgrabMatch_32_11,
&shiftgrabMatch_32_12,
&shiftgrabMatch_32_13,
&shiftgrabMatch_32_14,
&shiftgrabMatch_32_15,
};
static
const UNUSED u8 * (*shiftgrab_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&shiftgrabMatch_64_1,
&shiftgrabMatch_64_2,
&shiftgrabMatch_64_3,
&shiftgrabMatch_64_4,
&shiftgrabMatch_64_5,
&shiftgrabMatch_64_6,
&shiftgrabMatch_64_7,
&shiftgrabMatch_64_8,
&shiftgrabMatch_64_9,
&shiftgrabMatch_64_10,
&shiftgrabMatch_64_11,
&shiftgrabMatch_64_12,
&shiftgrabMatch_64_13,
&shiftgrabMatch_64_14,
&shiftgrabMatch_64_15,
&shiftgrabMatch_64_16,
&shiftgrabMatch_64_17,
&shiftgrabMatch_64_18,
&shiftgrabMatch_64_19,
&shiftgrabMatch_64_20,
&shiftgrabMatch_64_21,
&shiftgrabMatch_64_22,
&shiftgrabMatch_64_23,
&shiftgrabMatch_64_24,
&shiftgrabMatch_64_25,
&shiftgrabMatch_64_26,
&shiftgrabMatch_64_27,
&shiftgrabMatch_64_28,
&shiftgrabMatch_64_29,
&shiftgrabMatch_64_30,
&shiftgrabMatch_64_31,
};
#endif /* MULTIACCEL_SHIFTGRAB_H_ */
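The function-pointer tables above let a caller select the specialisation for a given run length with a single array index, with slot 0 deliberately unused. A minimal standalone sketch of that dispatch pattern follows; the names run_kernel, find_run_1, find_run_2, kernels and dispatch are illustrative, not Hyperscan's, and __builtin_ctz assumes GCC/Clang.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef const uint8_t *(*run_kernel)(const uint8_t *buf, uint32_t z);

/* One specialised kernel per run length (bodies kept trivial for brevity). */
static const uint8_t *find_run_1(const uint8_t *buf, uint32_t z) {
    return z ? buf + __builtin_ctz(z) : NULL;
}

static const uint8_t *find_run_2(const uint8_t *buf, uint32_t z) {
    z &= z >> 1;                        /* require two consecutive set bits */
    return z ? buf + __builtin_ctz(z) : NULL;
}

static run_kernel kernels[] = {
    NULL,                               /* slot 0 skipped, as in the tables above */
    find_run_1,
    find_run_2,
};

static const uint8_t *dispatch(const uint8_t *buf, uint32_t z, uint8_t run_len) {
    assert(run_len >= 1 && run_len < sizeof(kernels) / sizeof(kernels[0]));
    return kernels[run_len](buf, z);    /* same shape as (*match_funcs[run_len])(...) */
}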

114
src/nfa/multishufti.c Normal file

@ -0,0 +1,114 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shufti: character class acceleration.
*
* Utilises the SSSE3 pshufb shuffle instruction
*/
#include "config.h"
#include "ue2common.h"
#include "multishufti.h"
#include "multiaccel_common.h"
#if !defined(__AVX2__)
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multishufti_sse.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#else
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multishufti_avx2.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#endif
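multishufti.c generates one family of functions per acceleration mode by re-including the same implementation header with a different MATCH_ALGO prefix each time; the JOIN macros token-paste that prefix onto each function name. A minimal two-file sketch of the technique, shown inline here with illustrative names (sum_template.h, PREFIX, fast_/safe_), would look like this:

/* ---- sum_template.h (deliberately no include guard) ---- */
#define JOIN_(a, b) a ## b
#define JOIN(a, b) JOIN_(a, b)

static int JOIN(PREFIX, sum)(const int *v, int n) {
    int total = 0;
    for (int i = 0; i < n; i++) {
        total += v[i];
    }
    return total;
}

/* ---- sums.c: each include emits one named instantiation ---- */
#define PREFIX fast_
#include "sum_template.h"   /* defines fast_sum() */
#undef PREFIX

#define PREFIX safe_
#include "sum_template.h"   /* defines safe_sum() */
#undef PREFIX

In multishufti.c the "template" body is multishufti_sse.h or multishufti_avx2.h, and the prefixes are the acceleration modes (long_, longgrab_, shift_, and so on).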

src/nfa/sidecar.h → src/nfa/multishufti.h

@ -26,46 +26,42 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#ifndef SIDECAR_H
-#define SIDECAR_H
+/** \file
+ * \brief Multishufti: multibyte version of Shufti
+ *
+ * Utilises the SSSE3 pshufb shuffle instruction
+ */
+
+#ifndef MULTISHUFTI_H
+#define MULTISHUFTI_H

 #include "ue2common.h"
+#include "util/simd_utils.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-struct sidecar;
-struct sidecar_enabled;
-struct sidecar_scratch;
-
-/*
- * Sidecar is guaranteed to return the first match of a given id. However, in
- * various cases later matches may also be returned, as may matches for disabled
- * ids
- */
-typedef void (*SidecarCallback)(u64a offset, u32 id, void *context);
-
-void sidecarExec(const struct sidecar *n, const u8 *buffer, size_t len,
-                 struct sidecar_enabled *enabled,
-                 struct sidecar_scratch *sidecar_scratch,
-                 u64a base_offset, SidecarCallback cb, void *context);
-
-u32 sidecarScratchSize(const struct sidecar *n);
-
-void sidecarEnabledInit(const struct sidecar *n,
-                        struct sidecar_enabled *enabled);
-
-/* Note: sidecar literals need to be reenabled after they match.
- * This is purely because this behaviour is handy for rose.
- * In rose, they always set their roles when fired (never have to postpone due
- * to history) and if cleared their preds are also cleared so a pred would also
- * have to match again before we need to care about them again
- */
-void sidecarEnabledUnion(const struct sidecar *n, struct sidecar_enabled *dest,
-                         const struct sidecar_enabled *src);
-
-#define ID_TERMINATOR (~0U)
+const u8 *long_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                          const u8 *buf_end, const u8 run_len);
+
+const u8 *longgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                              const u8 *buf_end, const u8 run_len);
+
+const u8 *shift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                           const u8 *buf_end, const u8 run_len);
+
+const u8 *shiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                               const u8 *buf_end, const u8 run_len);
+
+const u8 *doubleshift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                                 const u8 *buf_end, const u8 run_len,
+                                 const u8 run2_len);
+
+const u8 *doubleshiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                                     const u8 *buf_end, const u8 run_len,
+                                     const u8 run2_len);

 #ifdef __cplusplus
 }
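The mask_lo/mask_hi parameters of these shuftiExec variants are the usual shufti nibble tables: 16 bytes indexed by a character's low nibble and 16 bytes indexed by its high nibble, with a byte in the class when the two selected entries share a set bit. A standalone scalar sketch of that membership test follows; the table contents are made up for illustration, since Hyperscan derives them at pattern-compile time.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Scalar equivalent of one shufti probe: two table lookups (pshufb in SIMD)
 * and an AND. */
static bool shufti_in_class(uint8_t c, const uint8_t lo[16], const uint8_t hi[16]) {
    return (lo[c & 0xf] & hi[c >> 4]) != 0;
}

/* Example: build tables that accept exactly the byte 'a' (0x61) using bit 0. */
static void build_single_char(uint8_t lo[16], uint8_t hi[16]) {
    memset(lo, 0, 16);
    memset(hi, 0, 16);
    lo[0x1] |= 1;   /* low nibble of 0x61 */
    hi[0x6] |= 1;   /* high nibble of 0x61 */
}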

122
src/nfa/multishufti_avx2.h Normal file

@ -0,0 +1,122 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "shufti_common.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
static really_inline
const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars,
const u8 *buf, const m256 low4bits,
const m256 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
}
const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
const u8 *buf,
const u8 *buf_end, u8 run_len
#ifdef MULTIACCEL_DOUBLE
, u8 run_len2
#endif
) {
assert(buf && buf_end);
assert(buf < buf_end);
// Slow path for small cases.
if (buf_end - buf < 32) {
return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
buf, buf_end);
}
const m256 zeroes = zeroes256();
const m256 low4bits = set32x8(0xf);
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
assert(buf_end - buf >= 32);
// Preconditioning: most of the time our buffer won't be aligned.
m256 chars = loadu256(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += (32 - min);
// Unrolling was here, but it wasn't doing anything but taking up space.
// Reroll FTW.
const u8 *last_block = buf_end - 32;
while (buf < last_block) {
m256 lchars = load256(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += 32;
}
// Use an unaligned load to mop up the last 32 bytes and get an accurate
// picture to buf_end.
assert(buf <= buf_end && buf >= buf_end - 32);
chars = loadu256(buf_end - 32);
rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf_end - 32,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
return buf_end;
}
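The structure of shuftiExec above (a slow path for tiny buffers, one unaligned head block, aligned blocks through the middle, then one unaligned block ending exactly at buf_end) is easier to see with the SIMD stripped out. Below is a standalone scalar sketch of the same scanning shape; the 16-byte block size, the scan/scan_block names and the use of memchr as a stand-in for fwdBlock are all illustrative assumptions, not Hyperscan code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLOCK 16   /* stand-in for the 32-byte AVX2 block */

static const uint8_t *scan_block(const uint8_t *p, uint8_t target) {
    return memchr(p, target, BLOCK);              /* stand-in for fwdBlock() */
}

static const uint8_t *scan(const uint8_t *buf, const uint8_t *buf_end,
                           uint8_t target) {
    if (buf_end - buf < BLOCK) {                  /* slow path for small cases */
        return memchr(buf, target, (size_t)(buf_end - buf));
    }
    const uint8_t *rv = scan_block(buf, target);  /* unaligned head block */
    if (rv) {
        return rv;
    }
    buf += BLOCK - ((uintptr_t)buf % BLOCK);      /* step up to the next boundary */
    const uint8_t *last_block = buf_end - BLOCK;
    while (buf < last_block) {                    /* aligned main loop */
        rv = scan_block(buf, target);
        if (rv) {
            return rv;
        }
        buf += BLOCK;
    }
    return scan_block(buf_end - BLOCK, target);   /* overlapping unaligned tail */
}

Unlike the real routine, this sketch returns NULL rather than buf_end when nothing matches; the overlap between the tail block and already-scanned bytes is harmless because those bytes were already known not to match.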

266
src/nfa/multishufti_sse.h Normal file

@ -0,0 +1,266 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "shufti_common.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
/* Normal SSSE3 shufti */
static really_inline
const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 mask_lo, m128 mask_hi, m128 chars,
const u8 *buf, const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
// negate first 16 bits
u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes) ^ 0xFFFF;
return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
}
/*
* 16-byte pipeline, for smaller scans
*/
static
const u8 *JOIN(MATCH_ALGO, shuftiPipeline16)(m128 mask_lo, m128 mask_hi,
const u8 *buf, const u8 *buf_end,
const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
const u8* ptr, *last_buf;
u32 last_res;
// pipeline prologue: scan first 16 bytes
m128 data = load128(buf);
u32 z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
last_buf = buf;
last_res = z;
buf += 16;
// now, start the pipeline!
assert((size_t)buf % 16 == 0);
for (; buf + 15 < buf_end; buf += 16) {
// scan more data
data = load128(buf);
z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
// do a comparison on previous result
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
last_buf = buf;
last_res = z;
}
assert(buf <= buf_end && buf >= buf_end - 16);
// epilogue: compare final results
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
return NULL;
}
/*
* 32-byte pipeline, for bigger scans
*/
static
const u8 *JOIN(MATCH_ALGO, shuftiPipeline32)(m128 mask_lo, m128 mask_hi,
const u8 *buf, const u8 *buf_end,
const m128 low4bits,
const m128 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
, const u8 run_len2
#endif
) {
const u8* ptr, *last_buf;
u32 res;
// pipeline prologue: scan first 32 bytes
m128 data1 = load128(buf);
u32 z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
m128 data2 = load128(buf + 16);
u32 z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
// store the results
u32 last_res = z1 | (z2 << 16);
last_buf = buf;
buf += 32;
// now, start the pipeline!
assert((size_t)buf % 16 == 0);
for (; buf + 31 < buf_end; buf += 32) {
// scan more data
data1 = load128(buf);
z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
data2 = load128(buf + 16);
z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
res = z1 | (z2 << 16);
// do a comparison on previous result
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
last_res = res;
last_buf = buf;
}
// epilogue: compare final results
ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
(last_buf, last_res
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
// if we still have some data left, scan it too
for (; buf + 15 < buf_end; buf += 16) {
m128 chars = load128(buf);
ptr = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(ptr)) {
return ptr;
}
}
assert(buf <= buf_end && buf >= buf_end - 16);
return NULL;
}
const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
const u8 *buf,
const u8 *buf_end, u8 run_len
#ifdef MULTIACCEL_DOUBLE
, u8 run_len2
#endif
) {
assert(buf && buf_end);
assert(buf < buf_end);
// Slow path for small cases.
if (buf_end - buf < 16) {
return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
buf, buf_end);
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
assert(buf_end - buf >= 16);
// Preconditioning: most of the time our buffer won't be aligned.
m128 chars = loadu128(buf);
rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
buf += (16 - min);
// if we have enough data, run bigger pipeline; otherwise run smaller one
if (buf_end - buf >= 128) {
rv = JOIN(MATCH_ALGO, shuftiPipeline32)(mask_lo, mask_hi,
buf, buf_end, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(rv)) {
return rv;
}
} else if (buf_end - buf >= 16){
rv = JOIN(MATCH_ALGO, shuftiPipeline16)(mask_lo, mask_hi,
buf, buf_end, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (unlikely(rv)) {
return rv;
}
}
// Use an unaligned load to mop up the last 16 bytes and get an accurate
// picture to buf_end.
chars = loadu128(buf_end - 16);
rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars,
buf_end - 16, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
, run_len2
#endif
);
if (rv) {
return rv;
}
return buf_end;
}
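shuftiPipeline16 and shuftiPipeline32 above overlap the SIMD classification of the current block with the branchier run check on the previous block's result, which is a small software pipeline. A standalone scalar sketch of that scheduling follows; classify() and check() are illustrative stand-ins for block() and the match_funcs call, the 16-byte block size mirrors the SSE path, __builtin_ctz assumes GCC/Clang, and the function assumes buf_end - buf >= 16 as the caller above guarantees.

#include <stddef.h>
#include <stdint.h>

static uint32_t classify(const uint8_t *p, uint8_t target) {
    uint32_t z = 0;                                  /* stand-in for block():       */
    for (int i = 0; i < 16; i++) {                   /* bit i set when p[i] matches */
        z |= (uint32_t)(p[i] == target) << i;
    }
    return z;
}

static const uint8_t *check(const uint8_t *p, uint32_t z) {
    return z ? p + __builtin_ctz(z) : NULL;          /* stand-in for the run check */
}

static const uint8_t *pipeline16(const uint8_t *buf, const uint8_t *buf_end,
                                 uint8_t target) {
    /* prologue: classify the first block but defer checking it */
    const uint8_t *last_buf = buf;
    uint32_t last_z = classify(buf, target);
    buf += 16;

    for (; buf + 15 < buf_end; buf += 16) {
        uint32_t z = classify(buf, target);          /* start work on this block...     */
        const uint8_t *rv = check(last_buf, last_z); /* ...while finishing the previous */
        if (rv) {
            return rv;
        }
        last_buf = buf;
        last_z = z;
    }

    /* epilogue: the final block's result is still pending */
    return check(last_buf, last_z);
}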

111
src/nfa/multitruffle.c Normal file

@ -0,0 +1,111 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ue2common.h"
#include "multitruffle.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
#include "multiaccel_common.h"
#if !defined(__AVX2__)
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multitruffle_sse.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#else
#define MATCH_ALGO long_
#include "multiaccel_long.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO longgrab_
#include "multiaccel_longgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shift_
#include "multiaccel_shift.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO shiftgrab_
#include "multiaccel_shiftgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MULTIACCEL_DOUBLE
#define MATCH_ALGO doubleshift_
#include "multiaccel_doubleshift.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#define MATCH_ALGO doubleshiftgrab_
#include "multiaccel_doubleshiftgrab.h"
#include "multitruffle_avx2.h"
#undef MATCH_ALGO
#undef MULTIACCEL_DOUBLE
#endif

73
src/nfa/multitruffle.h Normal file

@ -0,0 +1,73 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTITRUFFLE_H
#define MULTITRUFFLE_H
/** \file
* \brief Multitruffle: multibyte version of Truffle.
*
* Utilises the SSSE3 pshufb shuffle instruction
*/
#include "util/simd_types.h"
#ifdef __cplusplus
extern "C"
{
#endif
const u8 *long_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *longgrab_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *shift_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end, const u8 run_len);
const u8 *shiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len);
const u8 *doubleshift_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len,
const u8 run2_len);
const u8 *doubleshiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, const u8 *buf,
const u8 *buf_end, const u8 run_len,
const u8 run2_len);
#ifdef __cplusplus
}
#endif
#endif /* MULTITRUFFLE_H */
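The two truffle masks split the byte range in half: shuf_mask_lo_highclear describes bytes 0x00-0x7f and shuf_mask_lo_highset describes 0x80-0xff, so the pair can represent an arbitrary set over all 256 byte values (2 tables x 16 low-nibble slots x 8 bits per slot). The scalar sketch below shows one plausible encoding of that membership test purely for illustration; it is not lifted from Hyperscan's truffle implementation, which performs the equivalent lookups with pshufb.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative scalar reading of the two-mask scheme: pick the table by the
 * byte's top bit, index it by the low nibble, and select a bit with the
 * remaining three high-nibble bits. */
static bool truffle_in_class(uint8_t c, const uint8_t lo_highclear[16],
                             const uint8_t lo_highset[16]) {
    const uint8_t *tbl = (c & 0x80) ? lo_highset : lo_highclear;
    return (tbl[c & 0xf] >> ((c >> 4) & 0x7)) & 1;
}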

Some files were not shown because too many files have changed in this diff.