Merge branch develop into master

commit a00bd3167c
Matthew Barr, 2017-06-09 10:17:04 +10:00
334 changed files with 21513 additions and 22029 deletions

View File

@ -2,9 +2,52 @@
This is a list of notable changes to Hyperscan, in reverse chronological order.
## [4.5.0] 2017-06-09
- New API feature: approximate matching using the "edit distance" extended
parameter. This allows the user to request all matches that are a given edit
distance from an exact match for a pattern.
- Initial support for Intel(R) Advanced Vector Extensions 512 (Intel(R)
AVX-512), disabled by default. To enable it, pass `-DBUILD_AVX512=1` to
`cmake`.
- Major compile time improvements in many subsystems, reducing compile time
significantly for many large pattern sets.
- Internal reworking of literal matchers to operate on literals of at
most eight characters, with subsequent confirmation done in the Rose
interpreter. This reduces complexity and bytecode size and improves
performance for many pattern sets.
- Improve performance of the FDR literal matcher front end.
- Improve bucket assignment and other heuristics governing the FDR literal
matcher.
- Improve optimisation passes that take advantage of extended parameter
constraints (`min_offset`, etc).
- Introduce further lookaround specialisations to improve scanning performance.
- Optimise Rose interpreter construction to reduce the length of programs
generated in some situations.
- Remove the old "Rose" pattern decomposition analysis pass in favour of the
new "Violet" pass introduced in Hyperscan 4.3.0.
- In streaming mode, allow exhaustion (where the stream can no longer produce
matches) to be detected in more situations, improving scanning performance.
- Improve parsing of control verbs (such as `(*UTF8)`) that can only occur at
the beginning of the pattern. Combinations of supported verbs in any order
are now permitted.
- Update version of PCRE used by testing tools as a syntax and semantic
reference to PCRE 8.40.
- Tuning support for Intel(R) microarchitecture code names Skylake, Skylake
Server, Goldmont.
- CMake: when building a native build with a version of GCC that doesn't
recognise the host CPU, tune for the microarch selected by
`-march=native`.
- CMake: don't fail if SQLite (which is only required to build the `hsbench`
tool) is not present.
- CMake: detect libc++ directly and use that to inform the Boost version
requirement.
- Bugfix for issue #51: make the fat runtime build wrapper less fragile.
- Bugfix for issues #46, #52: use `sqlite3_errmsg()` to allow SQLite 3.6.x to
be used. Thanks to @EaseTheWorld for the PR.
## [4.4.1] 2017-02-28
- Bugfixes to fix issues where stale data was being referenced in scratch
memory. In particular this may have resulted in `hs_close_stream()`
referencing data from other previously scanned streams. This may have
resulted in incorrect matches being reported.
@ -142,9 +185,7 @@ This is a list of notable changes to Hyperscan, in reverse chronological order.
supplied with a NULL scratch pointer if no matches are required. This is in
line with the behaviour of `hs_close_stream()`.
- Disallow bounded repeats with a very large minimum repeat but no maximum,
i.e. {N,} for very large N.
- Reduce compile memory usage in literal set expansion for some large cases.
## [4.0.0] 2015-10-20

View File

@ -1,20 +1,22 @@
cmake_minimum_required (VERSION 2.8.11)
project (hyperscan C CXX)
set (HS_MAJOR_VERSION 4)
set (HS_MINOR_VERSION 5)
set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
include(CheckCXXSymbolExists)
INCLUDE (CheckFunctionExists)
INCLUDE (CheckIncludeFiles)
INCLUDE (CheckIncludeFileCXX)
INCLUDE (CheckLibraryExists)
INCLUDE (CheckSymbolExists)
include (CMakeDependentOption)
include (GNUInstallDirs)
include (${CMAKE_MODULE_PATH}/platform.cmake)
include (${CMAKE_MODULE_PATH}/ragel.cmake)
@ -36,6 +38,7 @@ endif()
set(BINDIR "${PROJECT_BINARY_DIR}/bin")
set(LIBDIR "${PROJECT_BINARY_DIR}/lib")
set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR})
# First for the generic no-config case
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${BINDIR}")
@ -59,31 +62,6 @@ include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PROJECT_BINARY_DIR})
include_directories(SYSTEM include)
include (${CMAKE_MODULE_PATH}/boost.cmake)
# -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6)
@ -132,6 +110,12 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
endif()
endif()
if (NOT BUILD_SHARED_LIBS)
# build static libs
set(BUILD_STATIC_LIBS ON)
mark_as_advanced(BUILD_STATIC_LIBS)
endif ()
#for config
if (OPTIMISE)
set(HS_OPTIMIZE ON)
@ -141,6 +125,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r
CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON)
option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime"
OFF)
option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF)
# TODO: per platform config files?
@ -148,16 +135,21 @@ option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC
# TODO: windows generator on cmake always uses msvc, even if we plan to build with icc
if(MSVC OR MSVC_IDE)
message(STATUS "Building for Windows")
if (MSVC_VERSION LESS 1700)
message(FATAL_ERROR "The project requires C++11 features.")
else()
if (WINDOWS_ICC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark")
set(ARCH_C_FLAGS "/QxHost")
set(ARCH_CXX_FLAGS "/QxHost")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
else()
#TODO: don't hardcode arch
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /arch:AVX /wd4267")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /arch:AVX /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
# todo: change these as required
set(ARCH_C_FLAGS "/arch:AVX2")
set(ARCH_CXX_FLAGS "/arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /wd4244 /wd4267")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
endif()
string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
@ -166,32 +158,58 @@ if(MSVC OR MSVC_IDE)
set(CMAKE_C_FLAGS_DEBUG "/DNDEBUG ${CMAKE_C_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS_DEBUG "/DNDEBUG ${CMAKE_CXX_FLAGS_DEBUG}")
endif ()
# flags only used to build hs libs
set(HS_C_FLAGS "/Gv")
set(HS_CXX_FLAGS "/Gv")
endif()
else()
# remove CMake's idea of optimisation
foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})
string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")
string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
endforeach ()
if (CMAKE_COMPILER_IS_GNUCC)
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
# If gcc doesn't recognise the host cpu, then mtune=native becomes
# generic, which isn't very good in some cases. march=native looks at
# cpuid info and then chooses the best microarch it can (and replaces
# the flag), so use that for tune.
# arg1 might exist if using ccache
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_VARIABLE _GCC_OUTPUT)
string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1"
GNUCC_ARCH "${_GCC_OUTPUT}")
# test the parsed flag
set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
OUTPUT_QUIET ERROR_QUIET
INPUT_FILE /dev/null
RESULT_VARIABLE GNUCC_TUNE_TEST)
if (NOT GNUCC_TUNE_TEST EQUAL 0)
message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
endif()
set(TUNE_FLAG ${GNUCC_ARCH})
else ()
set(TUNE_FLAG native)
endif()
# compiler version checks TODO: test more compilers
if (CMAKE_COMPILER_IS_GNUCXX)
set(GNUCXX_MINVER "4.8.1")
message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support")
endif()
endif()
if(OPTIMISE)
set(OPT_C_FLAG "-O3")
set(OPT_CXX_FLAG "-O2")
@ -216,12 +234,12 @@ else()
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
endif()
if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}")
endif()
if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}")
endif()
if(CMAKE_COMPILER_IS_GNUCC)
@ -244,6 +262,11 @@ else()
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
endif ()
if (CMAKE_C_COMPILER_ID MATCHES "Intel")
set(SKYLAKE_FLAG "-xCORE-AVX512")
else ()
set(SKYLAKE_FLAG "-march=skylake-avx512")
endif ()
endif()
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
@ -259,6 +282,9 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
# are we using libc++
CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP)
if (RELEASE_BUILD)
if (HAS_C_HIDDEN)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
@ -294,13 +320,10 @@ endif ()
include (${CMAKE_MODULE_PATH}/arch.cmake)
# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
if (NOT WIN32)
set(C_FLAGS_TO_CHECK
@ -404,13 +427,13 @@ endif()
endif()
if (NOT FAT_RUNTIME)
message(STATUS "Building for current host CPU")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
else()
message(STATUS "Building runtime for multiple microarchitectures")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message(STATUS "Building runtime for multiple microarchitectures")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()
add_subdirectory(util)
@ -435,19 +458,18 @@ if (NOT WIN32)
configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars
install(FILES ${CMAKE_BINARY_DIR}/libhs.pc
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endif()
# only set these after all tests are done
if (NOT FAT_RUNTIME)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${HS_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS} ${HS_CXX_FLAGS}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
endif()
if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused")
endif()
@ -459,13 +481,20 @@ set_source_files_properties(
ragelmaker(src/parser/Parser.rl)
set_source_files_properties(
${CMAKE_BINARY_DIR}/src/parser/control_verbs.cpp
PROPERTIES
COMPILE_FLAGS "${RAGEL_C_FLAGS}")
ragelmaker(src/parser/control_verbs.rl)
SET(hs_HEADERS
src/hs.h
src/hs_common.h
src/hs_compile.h
src/hs_runtime.h
)
install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
set (hs_exec_common_SRCS
src/alloc.c
@ -541,25 +570,6 @@ set (hs_exec_SRCS
src/nfa/mpv.h
src/nfa/mpv.c
src/nfa/mpv_internal.h
src/nfa/nfa_api.h
src/nfa/nfa_api_dispatch.c
src/nfa/nfa_internal.h
@ -573,13 +583,11 @@ set (hs_exec_SRCS
src/nfa/sheng_impl.h
src/nfa/sheng_impl4.h
src/nfa/sheng_internal.h
src/nfa/shufti.c
src/nfa/shufti.h
src/nfa/tamarama.c
src/nfa/tamarama.h
src/nfa/tamarama_internal.h
src/nfa/truffle.c
src/nfa/truffle.h
src/nfa/vermicelli.h
@ -662,6 +670,7 @@ SET (hs_SRCS
src/compiler/compiler.h
src/compiler/error.cpp
src/compiler/error.h
src/compiler/expression_info.h
src/fdr/engine_description.cpp
src/fdr/engine_description.h
src/fdr/fdr_compile.cpp
@ -719,8 +728,6 @@ SET (hs_SRCS
src/nfa/mpv_internal.h
src/nfa/mpvcompile.cpp
src/nfa/mpvcompile.h
src/nfa/nfa_api.h
src/nfa/nfa_api_queue.h
src/nfa/nfa_api_util.h
@ -775,6 +782,8 @@ SET (hs_SRCS
src/nfagraph/ng_extparam.h
src/nfagraph/ng_fixed_width.cpp
src/nfagraph/ng_fixed_width.h
src/nfagraph/ng_fuzzy.cpp
src/nfagraph/ng_fuzzy.h
src/nfagraph/ng_haig.cpp
src/nfagraph/ng_haig.h
src/nfagraph/ng_holder.cpp
@ -820,8 +829,6 @@ SET (hs_SRCS
src/nfagraph/ng_restructuring.h
src/nfagraph/ng_revacc.cpp
src/nfagraph/ng_revacc.h
src/nfagraph/ng_sep.cpp
src/nfagraph/ng_sep.h
src/nfagraph/ng_small_literal_set.cpp
@ -893,6 +900,8 @@ SET (hs_SRCS
src/parser/buildstate.h
src/parser/check_refs.cpp
src/parser/check_refs.h
src/parser/control_verbs.cpp
src/parser/control_verbs.h
src/parser/parse_error.cpp
src/parser/parse_error.h
src/parser/parser_util.cpp
@ -928,6 +937,8 @@ SET (hs_SRCS
src/rose/rose_build_compile.cpp
src/rose/rose_build_convert.cpp
src/rose/rose_build_convert.h
src/rose/rose_build_dedupe.cpp
src/rose/rose_build_engine_blob.cpp
src/rose/rose_build_engine_blob.h
src/rose/rose_build_exclusive.cpp
src/rose/rose_build_exclusive.h
@ -936,6 +947,10 @@ SET (hs_SRCS
src/rose/rose_build_impl.h
src/rose/rose_build_infix.cpp
src/rose/rose_build_infix.h
src/rose/rose_build_instructions.cpp
src/rose/rose_build_instructions.h
src/rose/rose_build_lit_accel.cpp
src/rose/rose_build_lit_accel.h
src/rose/rose_build_long_lit.cpp
src/rose/rose_build_long_lit.h
src/rose/rose_build_lookaround.cpp
@ -947,6 +962,7 @@ SET (hs_SRCS
src/rose/rose_build_misc.cpp
src/rose/rose_build_program.cpp
src/rose/rose_build_program.h
src/rose/rose_build_resources.h
src/rose/rose_build_role_aliasing.cpp
src/rose/rose_build_scatter.cpp
src/rose/rose_build_scatter.h
@ -982,8 +998,12 @@ SET (hs_SRCS
src/util/fatbit_build.h
src/util/graph.h
src/util/hash.h
src/util/hash_dynamic_bitset.h
src/util/math.h
src/util/multibit_build.cpp
src/util/multibit_build.h
src/util/noncopyable.h
src/util/operators.h
src/util/order_check.h
src/util/partial_store.h
src/util/partitioned_set.h
@ -993,6 +1013,7 @@ SET (hs_SRCS
src/util/report_manager.cpp
src/util/report_manager.h
src/util/simd_utils.h
src/util/small_vector.h
src/util/target_info.cpp
src/util/target_info.h
src/util/ue2_containers.h
@ -1048,8 +1069,6 @@ set(hs_dump_SRCS
src/rose/rose_build_dump.h
src/rose/rose_in_dump.cpp
src/rose/rose_in_dump.h
src/util/dump_charclass.cpp
src/util/dump_charclass.h
src/util/dump_util.cpp
@ -1074,10 +1093,14 @@ if (NOT FAT_RUNTIME)
set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
endif()
if (BUILD_STATIC_LIBS)
add_library(hs_exec OBJECT ${hs_exec_SRCS})
add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>)
set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>)
endif (BUILD_STATIC_LIBS)
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
@ -1085,51 +1108,98 @@ if (NOT FAT_RUNTIME)
endif()
else (FAT_RUNTIME)
set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh")
if (NOT BUILD_AVX512)
set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH")
endif (NOT BUILD_AVX512)
set_source_files_properties(src/dispatcher.c PROPERTIES
COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function")
COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}")
if (BUILD_STATIC_LIBS)
add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
set_target_properties(hs_exec_core2 PROPERTIES
COMPILE_FLAGS "-march=core2"
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
set_target_properties(hs_exec_corei7 PROPERTIES
COMPILE_FLAGS "-march=corei7"
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_avx2>)
set_target_properties(hs_exec_avx2 PROPERTIES
COMPILE_FLAGS "-march=core-avx2"
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
if (BUILD_AVX512)
add_library(hs_exec_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_avx512>)
set_target_properties(hs_exec_avx512 PROPERTIES
COMPILE_FLAGS "${SKYLAKE_FLAG}"
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
endif (BUILD_AVX512)
add_library(hs_exec_common OBJECT
${hs_exec_common_SRCS}
src/dispatcher.c
)
# hs_version.c is added explicitly to avoid some build systems that refuse to
# create a lib without any src (I'm looking at you Xcode)
add_library(hs_runtime STATIC src/hs_version.c
$<TARGET_OBJECTS:hs_exec_common>
${RUNTIME_LIBS})
set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
# we want the static lib for testing
add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c
${hs_SRCS}
$<TARGET_OBJECTS:hs_exec_common>
${RUNTIME_LIBS})
endif (BUILD_STATIC_LIBS)
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
# build shared libs
add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
set_target_properties(hs_exec_shared_core2 PROPERTIES
COMPILE_FLAGS "-march=core2"
POSITION_INDEPENDENT_CODE TRUE
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
set_target_properties(hs_exec_shared_corei7 PROPERTIES
COMPILE_FLAGS "-march=corei7"
POSITION_INDEPENDENT_CODE TRUE
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
add_library(hs_exec_shared_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_avx2>)
set_target_properties(hs_exec_shared_avx2 PROPERTIES
COMPILE_FLAGS "-march=core-avx2"
POSITION_INDEPENDENT_CODE TRUE
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
if (BUILD_AVX512)
add_library(hs_exec_shared_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS})
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_avx512>)
set_target_properties(hs_exec_shared_avx512 PROPERTIES
COMPILE_FLAGS "${SKYLAKE_FLAG}"
POSITION_INDEPENDENT_CODE TRUE
RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in"
)
endif (BUILD_AVX512)
add_library(hs_exec_common_shared OBJECT
${hs_exec_common_SRCS}
src/dispatcher.c
@ -1140,31 +1210,21 @@ else (FAT_RUNTIME)
endif() # SHARED
endif (NOT FAT_RUNTIME)
if (NOT BUILD_SHARED_LIBS)
install(TARGETS hs_runtime DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
if (NOT FAT_RUNTIME)
add_library(hs_runtime_shared SHARED src/hs_version.c
src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec_shared>)
else()
add_library(hs_runtime_shared SHARED src/hs_version.c
src/hs_valid_platform.c
$<TARGET_OBJECTS:hs_exec_common_shared>
${RUNTIME_SHLIBS})
endif()
set_target_properties(hs_runtime_shared PROPERTIES
VERSION ${LIB_VERSION}
@ -1173,24 +1233,17 @@ $<TARGET_OBJECTS:hs_exec_shared>)
MACOSX_RPATH ON
LINKER_LANGUAGE C)
install(TARGETS hs_runtime_shared
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
if (BUILD_STATIC_LIBS)
add_dependencies(hs ragel_Parser)
endif ()
if (NOT BUILD_SHARED_LIBS)
install(TARGETS hs DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
@ -1200,9 +1253,7 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
else()
add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c
${hs_SRCS} $<TARGET_OBJECTS:hs_exec_common_shared>
${RUNTIME_SHLIBS})
endif()
add_dependencies(hs_shared ragel_Parser)
@ -1212,11 +1263,18 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
SOVERSION ${LIB_SOVERSION}
MACOSX_RPATH ON)
install(TARGETS hs_shared
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
# used by tools and other targets
if (NOT BUILD_STATIC_LIBS)
# use shared lib without having to change all the targets
add_library(hs ALIAS hs_shared)
endif ()
if(NOT WIN32)
add_subdirectory(examples)
endif()

View File

@ -10,8 +10,24 @@ else ()
message (FATAL_ERROR "No intrinsics header found")
endif ()
if (BUILD_AVX512)
CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE)
if (NOT HAS_ARCH_SKYLAKE)
message (FATAL_ERROR "AVX512 not supported by compiler")
endif ()
endif ()
if (FAT_RUNTIME)
# test the highest level microarch to make sure everything works
if (BUILD_AVX512)
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
else ()
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2")
endif ()
else (NOT FAT_RUNTIME)
# if not fat runtime, then test given cflags
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
endif ()
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@ -31,5 +47,39 @@ int main(){
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)
# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512BW__)
#error no avx512bw
#endif
int main(){
__m512i z = _mm512_setzero_si512();
(void)_mm512_abs_epi8(z);
}" HAVE_AVX512)
if (FAT_RUNTIME)
if (NOT HAVE_SSSE3)
message(FATAL_ERROR "SSSE3 support required to build fat runtime")
endif ()
if (NOT HAVE_AVX2)
message(FATAL_ERROR "AVX2 support required to build fat runtime")
endif ()
if (BUILD_AVX512 AND NOT HAVE_AVX512)
message(FATAL_ERROR "AVX512 support requested but not supported")
endif ()
else (NOT FAT_RUNTIME)
if (NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
if (NOT HAVE_AVX512)
message(STATUS "Building without AVX512 support")
endif ()
if (NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
endif ()
unset (CMAKE_REQUIRED_FLAGS)
unset (INTRIN_INC_H)

View File

@ -45,10 +45,12 @@ if(HAVE_BACKTRACE)
if(HAS_RDYNAMIC)
list(INSERT BACKTRACE_LDFLAGS 0 -rdynamic)
endif()
else()
set(BACKTRACE_CFLAGS "")
set(BACKTRACE_LDFLAGS "")
endif()
# cmake scope fun
set(HAVE_BACKTRACE ${HAVE_BACKTRACE} CACHE BOOL INTERNAL)
set(BACKTRACE_CFLAGS ${BACKTRACE_CFLAGS} CACHE STRING INTERNAL)
set(BACKTRACE_LDFLAGS ${BACKTRACE_LDFLAGS} CACHE STRING INTERNAL)

View File

@ -1,3 +1,31 @@
# Various checks related to Boost
set(BOOST_USE_STATIC_LIBS OFF)
set(BOOST_USE_MULTITHREADED OFF)
set(BOOST_USE_STATIC_RUNTIME OFF)
if (HAVE_LIBCPP)
# we need a more recent boost for libc++
set(BOOST_MINVERSION 1.61.0)
else ()
set(BOOST_MINVERSION 1.57.0)
endif ()
set(BOOST_NO_BOOST_CMAKE ON)
unset(Boost_INCLUDE_DIR CACHE)
# we might have boost in tree, so provide a hint and try again
set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
find_package(Boost ${BOOST_MINVERSION} QUIET)
if(NOT Boost_FOUND)
# otherwise check for Boost installed on the system
unset(BOOST_INCLUDEDIR)
find_package(Boost ${BOOST_MINVERSION} QUIET)
if(NOT Boost_FOUND)
message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
endif()
endif()
message(STATUS "Boost version: ${Boost_MAJOR_VERSION}.${Boost_MINOR_VERSION}.${Boost_SUBMINOR_VERSION}")
# Boost 1.62 has a bug that we've patched around, check if it is required
if (Boost_VERSION EQUAL 106200)
set (CMAKE_REQUIRED_INCLUDES ${BOOST_INCLUDEDIR} "${PROJECT_SOURCE_DIR}/include")
@ -38,4 +66,7 @@ ${BOOST_REV_TEST}" BOOST_REVGRAPH_PATCH)
endif()
unset (CMAKE_REQUIRED_INCLUDES)
else ()
unset(BOOST_REVGRAPH_OK CACHE)
unset(BOOST_REVGRAPH_PATCH CACHE)
endif () # Boost 1.62.0

View File

@ -1,27 +1,28 @@
#!/bin/sh -e
# This is used for renaming symbols for the fat runtime, don't call directly
# TODO: make this a lot less fragile!
cleanup () {
    rm -f ${SYMSFILE} ${KEEPSYMS}
}
PREFIX=$1
KEEPSYMS_IN=$2
shift 2
# $@ contains the actual build command
OUT=$(echo "$@" | sed 's/.* -o \(.*\.o\).*/\1/')
trap cleanup INT QUIT EXIT
SYMSFILE=$(mktemp --tmpdir ${PREFIX}_rename.syms.XXXXX)
KEEPSYMS=$(mktemp --tmpdir keep.syms.XXXXX)
# find the libc used by gcc
LIBC_SO=$("$@" --print-file-name=libc.so.6)
cp ${KEEPSYMS_IN} ${KEEPSYMS}
# get all symbols from libc and turn them into patterns
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS}
# build the object
"$@"
# rename the symbols in the object
nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
if test -s ${SYMSFILE}
then
objcopy --redefine-syms=${SYMSFILE} ${OUT}
fi

View File

@ -1,5 +1,8 @@
/* used by cmake */
#ifndef CONFIG_H_
#define CONFIG_H_
/* "Define if the build is 32 bit" */
#cmakedefine ARCH_32_BIT
@ -43,6 +46,8 @@
0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
#cmakedefine HAVE_PTHREAD_NP_H
/* Define to 1 if you have the `malloc_info' function. */
#cmakedefine HAVE_MALLOC_INFO
@ -76,6 +81,9 @@
/* Define to 1 if you have the `_aligned_malloc' function. */
#cmakedefine HAVE__ALIGNED_MALLOC
/* Define if compiler has __builtin_constant_p */
#cmakedefine HAVE__BUILTIN_CONSTANT_P
/* Optimize, inline critical functions */
#cmakedefine HS_OPTIMIZE
@ -91,3 +99,5 @@
/* define if reverse_graph requires patch for boost 1.62.0 */
#cmakedefine BOOST_REVGRAPH_PATCH
#endif /* CONFIG_H_ */

View File

@ -22,7 +22,7 @@ if (NOT SQLITE3_FOUND)
set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3")
set(SQLITE3_LDFLAGS sqlite3_static)
else()
message(FATAL_ERROR " no sqlite3 in source tree")
message(STATUS " no sqlite3 in source tree")
endif()
endif()

View File

@ -64,7 +64,7 @@ libpcre are supported. The use of unsupported constructs will result in
compilation errors.
The version of PCRE used to validate Hyperscan's interpretation of this syntax
is 8.40.
====================
Supported Constructs
@ -171,6 +171,8 @@ The following regex constructs are not supported by Hyperscan:
* Callouts and embedded code.
* Atomic grouping and possessive quantifiers.
.. _semantics:
*********
Semantics
*********
@ -284,16 +286,24 @@ which provides the following fields:
expression should match successfully.
* ``min_length``: The minimum match length (from start to end) required to
successfully match this expression.
* ``edit_distance``: Match this expression within a given Levenshtein distance.
These parameters either allow the set of matches produced by a pattern to be
constrained at compile time (rather than relying on the application to process
unwanted matches at runtime), or allow matching a pattern approximately (within
a given edit distance) to produce more matches.
For example, the pattern :regexp:`/foo.*bar/` when given a ``min_offset`` of 10
and a ``max_offset`` of 15 will not produce matches when scanned against
``foobar`` or ``foo0123456789bar`` but will produce a match against the data
streams ``foo0123bar`` or ``foo0123456bar``.
Similarly, the pattern :regexp:`/foobar/` when given an ``edit_distance`` of 2
will produce matches when scanned against ``foobar``, ``fooba``, ``fobr``,
``fo_baz``, ``foooobar``, and anything else that lies within edit distance of 2
(as defined by Levenshtein distance). For more details, see the
:ref:`approximate_matching` section.
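
As an illustration (a minimal sketch, not taken from the API reference; the
pattern, ID and parameter values here are arbitrary), extended parameters are
supplied as a :c:type:`hs_expr_ext_t` structure passed to
:c:func:`hs_compile_ext_multi`: ::

   #include <stdio.h>
   #include <string.h>
   #include <hs/hs.h>

   /* Compile "foobar" so that the database also reports matches within
    * Levenshtein distance 2 of an exact match. */
   static hs_database_t *build_fuzzy_db(void) {
       hs_expr_ext_t ext;
       memset(&ext, 0, sizeof(ext));
       ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
       ext.edit_distance = 2;

       const char *expr = "foobar";
       unsigned flags = 0;
       unsigned id = 1;
       const hs_expr_ext_t *extp = &ext;
       hs_database_t *db = NULL;
       hs_compile_error_t *err = NULL;

       if (hs_compile_ext_multi(&expr, &flags, &id, &extp, 1, HS_MODE_BLOCK,
                                NULL, &db, &err) != HS_SUCCESS) {
           fprintf(stderr, "compile failed: %s\n", err->message);
           hs_free_compile_error(err);
           return NULL;
       }
       return db;
   }
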
=================
Prefiltering Mode
=================
@ -375,3 +385,74 @@ An :c:type:`hs_platform_info_t` structure targeted at the current host can be
built with the :c:func:`hs_populate_platform` function.
See :ref:`api_constants` for the full list of CPU tuning and feature flags.
.. _approximate_matching:
********************
Approximate matching
********************
Hyperscan provides an experimental approximate matching mode, which will match
patterns within a given edit distance. The exact matching behavior is defined as
follows:
#. **Edit distance** is defined as Levenshtein distance. That is, there are
three possible edit types considered: insertion, removal and substitution.
A more formal description can be found on
`Wikipedia <https://en.wikipedia.org/wiki/Levenshtein_distance>`_; a small
illustrative implementation follows this list.
#. **Approximate matching** will match all *corpora* within a given edit
distance. That is, given a pattern, approximate matching will match anything
that can be edited to arrive at a corpus that exactly matches the original
pattern.
#. **Matching semantics** are exactly the same as described in :ref:`semantics`.
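
For readers unfamiliar with the metric, the following standalone sketch
(illustrative only, not part of Hyperscan) computes Levenshtein distance with
the classic dynamic programming recurrence, making the three edit types
explicit: ::

   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>

   /* Classic single-row dynamic-programming Levenshtein distance.
    * Error handling is elided for brevity. */
   static unsigned edit_distance(const char *s, const char *t) {
       size_t n = strlen(s), m = strlen(t);
       unsigned *d = malloc((m + 1) * sizeof(unsigned));
       for (size_t j = 0; j <= m; j++) {
           d[j] = (unsigned)j; /* distance from the empty prefix of s */
       }
       for (size_t i = 1; i <= n; i++) {
           unsigned diag = d[0]; /* d[i-1][j-1] */
           d[0] = (unsigned)i;
           for (size_t j = 1; j <= m; j++) {
               unsigned up = d[j];                           /* d[i-1][j] */
               unsigned del = up + 1;                        /* removal */
               unsigned ins = d[j - 1] + 1;                  /* insertion */
               unsigned sub = diag + (s[i - 1] != t[j - 1]); /* substitution */
               d[j] = del < ins ? del : ins;
               if (sub < d[j]) {
                   d[j] = sub;
               }
               diag = up;
           }
       }
       unsigned result = d[m];
       free(d);
       return result;
   }

   int main(void) {
       /* two substitutions: prints 2 */
       printf("%u\n", edit_distance("foobar", "f00bar"));
       return 0;
   }
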
Here are a few examples of approximate matching:
* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
matching behavior. With approximate matching within edit distance 2, the
pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
``f``, and anything else that lies within edit distance 2 of matching corpora
for the original pattern (``foo`` in this case).
* Pattern :regexp:`/foo(bar)+/` with edit distance 1 will match ``foobarbar``,
``foobarb0r``, ``fooarbar``, ``foobarba``, ``f0obarbar``, ``fobarbar`` and
anything else that lies within edit distance 1 of matching corpora for the
original pattern (``foobarbar`` in this case).
* Pattern :regexp:`/foob?ar/` with edit distance 2 will match ``fooar``,
``foo``, ``fabar``, ``oar`` and anything else that lies within edit distance 2
of matching corpora for the original pattern (``fooar`` in this case).
Currently, there are trade-offs and limitations that come with approximate
matching support. Here they are, in a nutshell:
* Reduced pattern support:
* For many patterns, approximate matching is complex and can result in
Hyperscan failing to compile a pattern with a "Pattern too large" error,
even if the pattern is supported in normal operation.
* Additionally, some patterns cannot be approximately matched because they
reduce to so-called "vacuous" patterns (patterns that match everything). For
example, pattern :regexp:`/foo/` with edit distance 3, if implemented,
would reduce to matching zero-length buffers. Such patterns will result in a
"Pattern cannot be approximately matched" compile error.
* Finally, due to the inherent complexities of defining matching behavior,
approximate matching implements a reduced subset of regular expression
syntax. Approximate matching does not support UTF-8 (or other
multibyte character encodings) or word boundaries (that is, ``\b``, ``\B``
and other equivalent constructs). Patterns containing unsupported constructs
will result in a "Pattern cannot be approximately matched" compile error.
* When using approximate matching in conjunction with SOM, all of the
restrictions of SOM also apply. See :ref:`som` for more
details.
* Increased stream state/byte code size requirements: due to approximate
matching byte code being inherently larger and more complex than exact
matching, the corresponding requirements also increase.
* Performance overhead: similarly, there is generally a performance cost
associated with approximate matching, both due to increased matching
complexity, and due to the fact that it will produce more matches.
Approximate matching is always disabled by default, and can be enabled on a
per-pattern basis by using an extended parameter described in :ref:`extparam`.

View File

@ -44,7 +44,7 @@ master_doc = 'index'
# General information about the project.
project = u'Hyperscan'
copyright = u'2015-2017, Intel Corporation'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the

View File

@ -30,4 +30,4 @@ and/or other countries.
\*Other names and brands may be claimed as the property of others.
Copyright |copy| 2015-2017, Intel Corporation. All rights reserved.

View File

@ -254,18 +254,32 @@ the current platform is supported by Hyperscan.
As of this release, the variants of the runtime that are built, and the CPU
capability that is required, are the following:

+----------+-------------------------------+---------------------------+
| Variant  | CPU Feature Flag(s) Required  | gcc arch flag             |
+==========+===============================+===========================+
| Core 2   | ``SSSE3``                     | ``-march=core2``          |
+----------+-------------------------------+---------------------------+
| Core i7  | ``SSE4_2`` and ``POPCNT``     | ``-march=corei7``         |
+----------+-------------------------------+---------------------------+
| AVX 2    | ``AVX2``                      | ``-march=core-avx2``      |
+----------+-------------------------------+---------------------------+
| AVX 512  | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` |
+----------+-------------------------------+---------------------------+
.. note::
Hyperscan v4.5 adds support for AVX-512 instructions - in particular the
``AVX-512BW`` instruction set that was introduced on Intel "Skylake" Xeon
processors - however the AVX-512 runtime variant is **not** enabled by
default in fat runtime builds as not all toolchains support AVX-512
instruction sets. To build an AVX-512 runtime, the CMake variable
``BUILD_AVX512`` must be enabled manually during configuration. For
example: ::
cmake -DBUILD_AVX512=on <...>
As the fat runtime requires compiler, libc, and binutils support, at this time
it will only be enabled for Linux builds where the compiler supports the
`indirect function "ifunc" function attribute
<https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-indirect-functions-3321>`_.
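
To illustrate the mechanism in isolation (a generic sketch, not Hyperscan's
actual dispatcher code), an ``ifunc`` resolver runs once at load time and
binds the symbol to the implementation chosen for the host CPU: ::

   /* Generic GCC ifunc example; requires gcc on Linux. */
   #include <stdio.h>

   static int scan_generic(void) { return 0; }
   static int scan_avx2(void) { return 2; }

   /* The resolver runs at dynamic-link time and returns the function
    * that the symbol "scan" should bind to. */
   static int (*resolve_scan(void))(void) {
       __builtin_cpu_init();
       if (__builtin_cpu_supports("avx2")) {
           return scan_avx2;
       }
       return scan_generic;
   }

   int scan(void) __attribute__((ifunc("resolve_scan")));

   int main(void) {
       printf("selected variant: %d\n", scan());
       return 0;
   }
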

View File

@ -17,5 +17,6 @@ Hyperscan |version| Developer's Reference Guide
runtime
serialization
performance
tools
api_constants
api_files

View File

@ -70,6 +70,13 @@ For a given database, Hyperscan provides several guarantees:
See :ref:`runtime` for more detail.
*****
Tools
*****
Some utilities for testing and benchmarking Hyperscan are included with the
library. See :ref:`tools` for more information.
************
Example Code
************

View File

@ -333,3 +333,13 @@ Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
used to specify a lower bound on the length of the matches for a pattern. Using
this facility may be more lightweight in some circumstances than using the SOM
flag and post-confirming match length in the calling application.
********************
Approximate matching
********************
.. tip:: Approximate matching is an experimental feature.
There is generally a performance impact associated with approximate matching due
to the reduced specificity of the matches. This impact may vary significantly
depending on the pattern and edit distance.

doc/dev-reference/tools.rst (new file, 116 lines)
View File

@ -0,0 +1,116 @@
.. _tools:
#####
Tools
#####
This section describes the set of utilities included with the Hyperscan library.
********************
Benchmarker: hsbench
********************
The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
for a particular set of patterns and corpus of data to be scanned.
Patterns are supplied in the format described below in
:ref:`tools_pattern_format`, while the corpus must be provided in the form of a
`corpus database`: this is a simple SQLite database format intended to allow for
easy control of how a corpus is broken into blocks and streams.
.. note:: A group of Python scripts for constructing corpora databases from
various input types, such as PCAP network traffic captures or text files, can
be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
Running hsbench
===============
Given a file full of patterns specified with ``-e`` and a corpus database
specified with ``-c``, ``hsbench`` will perform a single-threaded benchmark and
produce output like this::
$ hsbench -e /tmp/patterns -c /tmp/corpus.db
Signatures: /tmp/patterns
Hyperscan info: Version: 4.3.1 Features: AVX2 Mode: STREAM
Expression count: 200
Bytecode size: 342,540 bytes
Database CRC: 0x6cd6b67c
Stream state size: 252 bytes
Scratch size: 18,406 bytes
Compile time: 0.153 seconds
Peak heap usage: 78,073,856 bytes
Time spent scanning: 0.600 seconds
Corpus size: 72,138,183 bytes (63,946 blocks in 8,891 streams)
Scan matches: 81 (0.001 matches/kilobyte)
Overall block rate: 2,132,004.45 blocks/sec
Overall throughput: 19,241.10 Mbit/sec
By default, the corpus is scanned twenty times, and the overall performance
reported is computed based on the total number of bytes scanned in the time it
takes to perform all twenty scans. The number of repeats can be changed with the
``-n`` argument, and the results of each scan will be displayed if the
``--per-scan`` argument is specified.
To benchmark Hyperscan on more than one core, you can supply a list of cores
with the ``-T`` argument, which will instruct ``hsbench`` to start one
benchmark thread per core given and compute the throughput from the time taken
to complete all of them.
.. tip:: For single-threaded benchmarks on multi-processor systems, we recommend
using a utility like ``taskset`` to lock the hsbench process to one core and
minimize jitter due to the operating system's scheduler.
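
For example, a hypothetical pinned single-threaded run might look like: ::

   $ taskset -c 2 hsbench -e /tmp/patterns -c /tmp/corpus.db
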
.. _tools_pattern_format:
**************
Pattern Format
**************
All of the Hyperscan tools accept patterns in the same format, read from plain
text files with one pattern per line. Each line looks like this:
* ``<integer id>:/<regex>/<flags>``
For example::
1:/hatstand.*teakettle/s
2:/(hatstand|teakettle)/iH
3:/^.{10,20}hatstand/m
The integer ID is the value that will be reported when a match is found by
Hyperscan and must be unique.
The pattern itself is a regular expression in PCRE syntax; see
:ref:`compilation` for more information on supported features.
The flags are single characters that map to Hyperscan flags as follows:
========= ================================= ===========
Character API Flag Description
========= ================================= ===========
``i`` :c:member:`HS_FLAG_CASELESS` Case-insensitive matching
``s`` :c:member:`HS_FLAG_DOTALL` Dot (``.``) will match newlines
``m`` :c:member:`HS_FLAG_MULTILINE` Multi-line anchoring
``H`` :c:member:`HS_FLAG_SINGLEMATCH` Report match ID at most once
``V`` :c:member:`HS_FLAG_ALLOWEMPTY` Allow patterns that can match against empty buffers
``8`` :c:member:`HS_FLAG_UTF8` UTF-8 mode
``W`` :c:member:`HS_FLAG_UCP` Unicode property support
``P`` :c:member:`HS_FLAG_PREFILTER` Prefiltering mode
``L`` :c:member:`HS_FLAG_SOM_LEFTMOST` Leftmost start of match reporting
========= ================================= ===========
In addition to the set of flags above, :ref:`extparam` can be supplied
for each pattern. These are supplied after the flags as ``key=value`` pairs
between braces, separated by commas. For example::
1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
All Hyperscan tools will accept a pattern file (or a directory containing
pattern files) with the ``-e`` argument. If no further arguments constraining
the pattern set are given, all patterns in those files are used.
To select a subset of the patterns, a single ID can be supplied with the ``-z``
argument, or a file containing a set of IDs can be supplied with the ``-s``
argument.

View File

@ -22,3 +22,6 @@ set_source_files_properties(patbench.cc PROPERTIES COMPILE_FLAGS
"-Wall -Wno-unused-parameter")
target_link_libraries(patbench hs pcap)
endif()
install(FILES simplegrep.c pcapscan.cc patbench.cc README.md
DESTINATION ${CMAKE_INSTALL_DOCDIR}/examples)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -67,7 +67,7 @@ hs_free_t normalise_free(hs_free_t f) {
}
HS_PUBLIC_API
hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
hs_set_database_allocator(allocfunc, freefunc);
hs_set_misc_allocator(allocfunc, freefunc);
hs_set_stream_allocator(allocfunc, freefunc);
@ -77,7 +77,8 @@ hs_error_t hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
}
HS_PUBLIC_API
hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t allocfunc,
hs_free_t freefunc) {
hs_database_alloc = normalise_alloc(allocfunc);
hs_database_free = normalise_free(freefunc);
@ -85,7 +86,8 @@ hs_error_t hs_set_database_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
}
HS_PUBLIC_API
hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t allocfunc,
hs_free_t freefunc) {
hs_misc_alloc = normalise_alloc(allocfunc);
hs_misc_free = normalise_free(freefunc);
@ -93,7 +95,8 @@ hs_error_t hs_set_misc_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
}
HS_PUBLIC_API
hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t allocfunc,
hs_free_t freefunc) {
hs_scratch_alloc = normalise_alloc(allocfunc);
hs_scratch_free = normalise_free(freefunc);
@ -101,7 +104,8 @@ hs_error_t hs_set_scratch_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
}
HS_PUBLIC_API
hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t allocfunc,
hs_free_t freefunc) {
hs_stream_alloc = normalise_alloc(allocfunc);
hs_stream_free = normalise_free(freefunc);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -42,6 +42,8 @@
* word-to-word and word-to-nonword) are dropped.
*/
#include "asserts.h"
#include "compiler/compiler.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_prune.h"
#include "nfagraph/ng_redundancy.h"
@ -115,8 +117,8 @@ u32 conjunct(u32 flags1, u32 flags2) {
typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t;
static
void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
edge_cache_t &edge_cache, u32 &assert_edge_count) {
DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index);
const u32 flags = g[t].assert_flags;
@ -178,8 +180,7 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
edge_cache.emplace(cache_key, e);
g[e].assert_flags = flags;
if (++assert_edge_count > MAX_ASSERT_EDGES) {
throw CompileError(expr.index, "Pattern is too large.");
}
} else {
NFAEdge e = ecit->second;
@ -200,21 +201,23 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
}
static
void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
NFAVertex v, s32 adj) {
// Don't try and set the report ID of a special vertex.
assert(!is_special(v, g));
// There should be no reports set already.
assert(g[v].reports.empty());
Report r = rm.getBasicInternalReport(expr, adj);
g[v].reports.insert(rm.getInternalId(r));
DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
}
static
void checkForMultilineStart(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
vector<NFAEdge> dead;
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) {
@ -238,7 +241,7 @@ void checkForMultilineStart(ReportManager &rm, NGWrapper &g) {
for (const auto &e : dead) {
NFAVertex dummy = add_vertex(g);
g[dummy].char_reach.setall();
setReportId(rm, g, expr, dummy, -1);
add_edge(source(e, g), dummy, g[e], g);
add_edge(dummy, g.accept, g);
}
@ -263,7 +266,8 @@ bool hasAssertVertices(const NGHolder &g) {
* Remove the horrors that are the temporary assert vertices which arise from
* our construction method. Allows the rest of our code base to live in
* blissful ignorance of their existence. */
void removeAssertVertices(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
size_t num = 0;
DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g));
@ -285,12 +289,12 @@ void removeAssertVertices(ReportManager &rm, NGWrapper &g) {
for (auto v : vertices_range(g)) {
if (g[v].assert_flags & WORDBOUNDARY_FLAGS) {
replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count);
num++;
}
}
checkForMultilineStart(rm, g, expr);
if (num) {
DEBUG_PRINTF("resolved %zu assert vertices\n", num);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,8 +35,9 @@
namespace ue2 {
class ExpressionInfo;
class ReportManager;
class NGHolder;
/** \brief Convert temporary assert vertices (from construction method) to
* edge-based flags.
@ -44,7 +45,8 @@ class NGWrapper;
* Remove the horrors that are the temporary assert vertices which arise from
* our construction method. Allows the rest of our code base to live in
* blissful ignorance of their existence. */
void removeAssertVertices(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr);
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -55,9 +55,8 @@
#include "parser/unsupported.h"
#include "parser/utf8_validate.h"
#include "rose/rose_build.h"
#include "rose/rose_build_dump.h"
#include "som/slot_manager_dump.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include "util/compile_error.h"
#include "util/target_info.h"
#include "util/verify_types.h"
@ -74,12 +73,12 @@ using namespace std;
namespace ue2 {
static
void validateExt(const hs_expr_ext &ext) {
static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
HS_EXT_FLAG_MAX_OFFSET |
HS_EXT_FLAG_MIN_LENGTH |
HS_EXT_FLAG_EDIT_DISTANCE;
if (ext.flags & ~ALL_EXT_FLAGS) {
throw CompileError("Invalid hs_expr_ext flag set.");
}
@ -100,25 +99,18 @@ void validateExt(const hs_expr_ext &ext) {
}
ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
unsigned flags, ReportID report,
const hs_expr_ext *ext)
: expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH,
false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET,
0, 0) {
ParseMode mode(flags);
component = parse(expression, mode);
expr.utf8 = mode.utf8; /* utf8 may be set by parse() */
if (expr.utf8 && !isValidUtf8(expression)) {
throw ParseError("Expression is not valid UTF-8.");
}
@ -146,7 +138,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
// Set SOM type.
if (flags & HS_FLAG_SOM_LEFTMOST) {
expr.som = SOM_LEFT;
}
// Set extended parameters, if we have them.
@ -155,26 +147,29 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
validateExt(*ext);
if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) {
expr.min_offset = ext->min_offset;
}
if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) {
expr.max_offset = ext->max_offset;
}
if (ext->flags & HS_EXT_FLAG_MIN_LENGTH) {
expr.min_length = ext->min_length;
}
if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) {
expr.edit_distance = ext->edit_distance;
}
}
// These are validated in validateExt, so an error will already have been
// thrown if these conditions don't hold.
assert(expr.max_offset >= expr.min_offset);
assert(expr.max_offset >= expr.min_length);
// Since prefiltering and SOM aren't supported together, we must squash any
// min_length constraint as well.
if (flags & HS_FLAG_PREFILTER && expr.min_length) {
DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n");
expr.min_length = 0;
}
}
@ -183,25 +178,25 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
* \brief Dumps the parse tree to screen in debug mode and to disk in dump
* mode.
*/
void dumpExpression(UNUSED const ParsedExpression &expr,
void dumpExpression(UNUSED const ParsedExpression &pe,
UNUSED const char *stage, UNUSED const Grey &grey) {
#if defined(DEBUG)
DEBUG_PRINTF("===== Rule ID: %u (internalID: %u) =====\n", expr.id,
expr.index);
DEBUG_PRINTF("===== Rule ID: %u (expression index: %u) =====\n",
pe.expr.report, pe.expr.index);
ostringstream debug_tree;
dumpTree(debug_tree, expr.component.get());
dumpTree(debug_tree, pe.component.get());
printf("%s\n", debug_tree.str().c_str());
#endif // DEBUG
#if defined(DUMP_SUPPORT)
if (grey.dumpFlags & Grey::DUMP_PARSE) {
stringstream ss;
ss << grey.dumpPath << "Expr_" << expr.index << "_componenttree_"
ss << grey.dumpPath << "Expr_" << pe.expr.index << "_componenttree_"
<< stage << ".txt";
ofstream out(ss.str().c_str());
out << "Component Tree for " << expr.id << endl;
dumpTree(out, expr.component.get());
if (expr.utf8) {
out << "Component Tree for " << pe.expr.report << endl;
dumpTree(out, pe.component.get());
if (pe.expr.utf8) {
out << "UTF8 mode" << endl;
}
}
@ -211,13 +206,13 @@ void dumpExpression(UNUSED const ParsedExpression &expr,
/** \brief Run Component tree optimisations on \a expr. */
static
void optimise(ParsedExpression &expr) {
if (expr.min_length || expr.som) {
void optimise(ParsedExpression &pe) {
if (pe.expr.min_length || pe.expr.som) {
return;
}
DEBUG_PRINTF("optimising\n");
expr.component->optimise(true /* root is connected to sds */);
pe.component->optimise(true /* root is connected to sds */);
}
void addExpression(NG &ng, unsigned index, const char *expression,
@ -234,34 +229,34 @@ void addExpression(NG &ng, unsigned index, const char *expression,
// Do per-expression processing: errors here will result in an exception
// being thrown up to our caller
ParsedExpression expr(index, expression, flags, id, ext);
dumpExpression(expr, "orig", cc.grey);
ParsedExpression pe(index, expression, flags, id, ext);
dumpExpression(pe, "orig", cc.grey);
// Apply prefiltering transformations if desired.
if (expr.prefilter) {
prefilterTree(expr.component, ParseMode(flags));
dumpExpression(expr, "prefiltered", cc.grey);
if (pe.expr.prefilter) {
prefilterTree(pe.component, ParseMode(flags));
dumpExpression(pe, "prefiltered", cc.grey);
}
// Expressions containing zero-width assertions and other extended pcre
// types aren't supported yet. This call will throw a ParseError exception
// if the component tree contains such a construct.
checkUnsupported(*expr.component);
checkUnsupported(*pe.component);
expr.component->checkEmbeddedStartAnchor(true);
expr.component->checkEmbeddedEndAnchor(true);
pe.component->checkEmbeddedStartAnchor(true);
pe.component->checkEmbeddedEndAnchor(true);
if (cc.grey.optimiseComponentTree) {
optimise(expr);
dumpExpression(expr, "opt", cc.grey);
optimise(pe);
dumpExpression(pe, "opt", cc.grey);
}
DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n",
expr.component.get(), expr.index, expr.id);
pe.component.get(), pe.expr.index, pe.expr.report);
// You can only use the SOM flags if you've also specified an SOM
// precision mode.
if (expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) {
if (pe.expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) {
throw CompileError("To use a SOM expression flag in streaming mode, "
"an SOM precision mode (e.g. "
"HS_MODE_SOM_HORIZON_LARGE) must be specified.");
@ -269,32 +264,31 @@ void addExpression(NG &ng, unsigned index, const char *expression,
// If this expression is a literal, we can feed it directly to Rose rather
// than building the NFA graph.
if (shortcutLiteral(ng, expr)) {
if (shortcutLiteral(ng, pe)) {
DEBUG_PRINTF("took literal short cut\n");
return;
}
unique_ptr<NGWrapper> g = buildWrapper(ng.rm, cc, expr);
if (!g) {
auto built_expr = buildGraph(ng.rm, cc, pe);
if (!built_expr.g) {
DEBUG_PRINTF("NFA build failed on ID %u, but no exception was "
"thrown.\n", expr.id);
"thrown.\n", pe.expr.report);
throw CompileError("Internal error.");
}
if (!expr.allow_vacuous && matches_everywhere(*g)) {
if (!pe.expr.allow_vacuous && matches_everywhere(*built_expr.g)) {
throw CompileError("Pattern matches empty buffer; use "
"HS_FLAG_ALLOWEMPTY to enable support.");
}
if (!ng.addGraph(*g)) {
DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", expr.id);
if (!ng.addGraph(built_expr.expr, std::move(built_expr.g))) {
DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", pe.expr.report);
throw CompileError("Error compiling expression.");
}
}
static
aligned_unique_ptr<RoseEngine> generateRoseEngine(NG &ng) {
bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
const u32 minWidth =
ng.minWidth.is_finite() ? verify_u32(ng.minWidth) : ROSE_BOUND_INF;
auto rose = ng.rose->buildRose(minWidth);
@ -305,7 +299,6 @@ aligned_unique_ptr<RoseEngine> generateRoseEngine(NG &ng) {
return nullptr;
}
dumpRose(*ng.rose, rose.get(), ng.cc.grey);
dumpReportManager(ng.rm, ng.cc.grey);
dumpSomSlotManager(ng.ssm, ng.cc.grey);
dumpSmallWrite(rose.get(), ng.cc.grey);
@ -320,6 +313,9 @@ platform_t target_to_platform(const target_t &target_info) {
if (!target_info.has_avx2()) {
p |= HS_PLATFORM_NOAVX2;
}
if (!target_info.has_avx512()) {
p |= HS_PLATFORM_NOAVX512;
}
return p;
}
@ -369,7 +365,7 @@ struct hs_database *build(NG &ng, unsigned int *length) {
if (!rose) {
throw CompileError("Unable to generate bytecode.");
}
*length = roseSize(rose.get());
*length = rose.size();
if (!*length) {
DEBUG_PRINTF("RoseEngine has zero length\n");
assert(0);
@ -450,41 +446,42 @@ bool isSupported(const Component &c) {
}
#endif
unique_ptr<NGWrapper> buildWrapper(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &expr) {
assert(isSupported(*expr.component));
BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &pe) {
assert(isSupported(*pe.component));
const unique_ptr<NFABuilder> builder = makeNFABuilder(rm, cc, expr);
const auto builder = makeNFABuilder(rm, cc, pe);
assert(builder);
// Set up START and ACCEPT states; retrieve the special states
const auto bs = makeGlushkovBuildState(*builder, expr.prefilter);
const auto bs = makeGlushkovBuildState(*builder, pe.expr.prefilter);
// Map position IDs to characters/components
expr.component->notePositions(*bs);
pe.component->notePositions(*bs);
// Wire the start dotstar state to the firsts
connectInitialStates(*bs, expr);
connectInitialStates(*bs, pe);
DEBUG_PRINTF("wire up body of expr\n");
// Build the rest of the FOLLOW set
vector<PositionInfo> initials = {builder->getStartDotStar(),
builder->getStart()};
expr.component->buildFollowSet(*bs, initials);
pe.component->buildFollowSet(*bs, initials);
// Wire the lasts to the accept state
connectFinalStates(*bs, expr);
connectFinalStates(*bs, pe);
// Create our edges
bs->buildEdges();
auto g = builder->getGraph();
assert(g);
BuiltExpression built_expr = builder->getGraph();
assert(built_expr.g);
dumpDotWrapper(*g, "00_before_asserts", cc.grey);
removeAssertVertices(rm, *g);
dumpDotWrapper(*built_expr.g, built_expr.expr, "00_before_asserts",
cc.grey);
removeAssertVertices(rm, *built_expr.g, built_expr.expr);
return g;
return built_expr;
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,11 +35,11 @@
#include "ue2common.h"
#include "database.h"
#include "compiler/expression_info.h"
#include "parser/Component.h"
#include "som/som.h"
#include "util/noncopyable.h"
#include <memory>
#include <boost/core/noncopyable.hpp>
struct hs_database;
struct hs_expr_ext;
@ -50,34 +50,32 @@ struct CompileContext;
struct Grey;
struct target_t;
class NG;
class NGHolder;
class ReportManager;
class NGWrapper;
/** Class gathering together the pieces of a parsed expression.
* Note: Owns the provided component.
*/
class ParsedExpression : boost::noncopyable {
/** \brief Class gathering together the pieces of a parsed expression. */
class ParsedExpression : noncopyable {
public:
ParsedExpression(unsigned index, const char *expression, unsigned flags,
ReportID actionId, const hs_expr_ext *ext = nullptr);
ReportID report, const hs_expr_ext *ext = nullptr);
bool utf8; //!< UTF-8 mode flag specified
/** \brief Expression information (from flags, extparam etc) */
ExpressionInfo expr;
/** \brief root node of parsed component tree. */
std::unique_ptr<ue2::Component> component;
/** \brief Root node of parsed component tree. */
std::unique_ptr<Component> component;
};
const bool allow_vacuous; //!< HS_FLAG_ALLOWEMPTY specified
const bool highlander; //!< HS_FLAG_SINGLEMATCH specified
const bool prefilter; //!< HS_FLAG_PREFILTER specified
som_type som; //!< chosen SOM mode, or SOM_NONE
/**
* \brief Class gathering together the pieces of an expression that has been
* built into an NFA graph.
*/
struct BuiltExpression {
/** \brief Expression information (from flags, extparam etc) */
ExpressionInfo expr;
/** \brief index in expressions array passed to \ref hs_compile_multi */
const unsigned index;
const ReportID id; //!< user-specified pattern ID
u64a min_offset; //!< 0 if not used
u64a max_offset; //!< MAX_OFFSET if not used
u64a min_length; //!< 0 if not used
/** \brief Built Glushkov NFA graph. */
std::unique_ptr<NGHolder> g;
};
/**
@ -94,12 +92,12 @@ public:
* @param ext
* Struct containing extra parameters for this expression, or NULL if
* none.
* @param actionId
* @param report
* The identifier to associate with the expression; returned by engine on
* match.
*/
void addExpression(NG &ng, unsigned index, const char *expression,
unsigned flags, const hs_expr_ext *ext, ReportID actionId);
unsigned flags, const hs_expr_ext *ext, ReportID report);
/**
* Build a Hyperscan database out of the expressions we've been given. A
@ -127,9 +125,8 @@ struct hs_database *build(NG &ng, unsigned int *length);
* @return
* nullptr on error.
*/
std::unique_ptr<NGWrapper> buildWrapper(ReportManager &rm,
const CompileContext &cc,
const ParsedExpression &expr);
BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &expr);
/**
* Build a platform_t out of a target_t.

View File

@ -0,0 +1,102 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file
* \brief ExpressionInfo class for storing the properties of an expression.
*/
#ifndef COMPILER_EXPRESSION_INFO_H
#define COMPILER_EXPRESSION_INFO_H
#include "ue2common.h"
#include "som/som.h"
namespace ue2 {
/** \brief Properties of an expression. */
class ExpressionInfo {
public:
ExpressionInfo(unsigned int index_in, bool allow_vacuous_in,
bool highlander_in, bool utf8_in, bool prefilter_in,
som_type som_in, ReportID report_in, u64a min_offset_in,
u64a max_offset_in, u64a min_length_in, u32 edit_distance_in)
: index(index_in), report(report_in), allow_vacuous(allow_vacuous_in),
highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in),
som(som_in), min_offset(min_offset_in), max_offset(max_offset_in),
min_length(min_length_in), edit_distance(edit_distance_in) {}
/**
* \brief Index of the expression represented by this graph.
*
* Used:
* - down the track in error handling;
* - for identifying parts of an expression in highlander mode.
*/
unsigned int index;
/** \brief Report ID specified by the user. */
ReportID report;
/** \brief Vacuous pattern is allowed. (HS_FLAG_ALLOWEMPTY) */
bool allow_vacuous;
/** \brief "Highlander" (single match) pattern. (HS_FLAG_SINGLEMATCH) */
bool highlander;
/** \brief UTF-8 pattern. (HS_FLAG_UTF8) */
bool utf8;
/** \brief Prefiltering pattern. (HS_FLAG_PREFILTER) */
bool prefilter;
/** \brief Start-of-match type requested, or SOM_NONE. */
som_type som;
/** \brief Minimum match offset extended parameter. 0 if not used. */
u64a min_offset;
/**
* \brief Maximum match offset extended parameter.
* MAX_OFFSET if not used.
*/
u64a max_offset;
/** \brief Minimum match length extended parameter. 0 if not used. */
u64a min_length;
/**
* \brief Approximate matching edit distance extended parameter.
* 0 if not used.
*/
u32 edit_distance;
};
} // namespace ue2
#endif // COMPILER_EXPRESSION_INFO_H

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,14 +29,10 @@
#include "crc32.h"
#include "config.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#if defined(HAVE_C_X86INTRIN_H)
#include <x86intrin.h>
#elif defined(HAVE_C_INTRIN_H)
#include <intrin.h>
#endif
#ifndef __SSE4_2__
#if !defined(HAVE_SSE42)
/***
*** What follows is derived from Intel's Slicing-by-8 CRC32 impl, which is BSD
@ -582,7 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
return crc;
}
#else // __SSE4_2__
#else // HAVE_SSE42
#ifdef ARCH_64_BIT
#define CRC_WORD 8
@ -638,7 +634,7 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
// Externally visible function
u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) {
#ifdef __SSE4_2__
#if defined(HAVE_SSE42)
u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen);
#else
u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen);
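
The selection between the hardware and table-driven paths is purely compile-time via `HAVE_SSE42`. A self-contained sketch of the same idea at single-byte granularity; the bitwise fallback uses the reflected CRC32-C polynomial 0x82F63B78, which the SSE4.2 `crc32` instruction also implements:

```cpp
#include <cstdint>
#if defined(HAVE_SSE42)
#include <nmmintrin.h> // SSE4.2 intrinsics
#endif

uint32_t crc32c_byte(uint32_t crc, uint8_t b) {
#if defined(HAVE_SSE42)
    return _mm_crc32_u8(crc, b); // hardware CRC32-C
#else
    // bitwise CRC32-C fallback, reflected polynomial 0x82F63B78
    crc ^= b;
    for (int i = 0; i < 8; i++) {
        crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc;
#endif
}
```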

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -49,7 +49,7 @@ int db_correctly_aligned(const void *db) {
}
HS_PUBLIC_API
hs_error_t hs_free_database(hs_database_t *db) {
hs_error_t HS_CDECL hs_free_database(hs_database_t *db) {
if (db && db->magic != HS_DB_MAGIC) {
return HS_INVALID;
}
@ -59,8 +59,8 @@ hs_error_t hs_free_database(hs_database_t *db) {
}
HS_PUBLIC_API
hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
size_t *serialized_length) {
hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes,
size_t *serialized_length) {
if (!db || !bytes || !serialized_length) {
return HS_INVALID;
}
@ -114,7 +114,8 @@ hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
static
hs_error_t db_check_platform(const u64a p) {
if (p != hs_current_platform
&& p != hs_current_platform_no_avx2) {
&& p != hs_current_platform_no_avx2
&& p != hs_current_platform_no_avx512) {
return HS_DB_PLATFORM_ERROR;
}
// passed all checks
@ -195,8 +196,9 @@ void db_copy_bytecode(const char *serialized, hs_database_t *db) {
}
HS_PUBLIC_API
hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
hs_database_t *db) {
hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes,
const size_t length,
hs_database_t *db) {
if (!bytes || !db) {
return HS_INVALID;
}
@ -237,8 +239,9 @@ hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
}
HS_PUBLIC_API
hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
hs_database_t **db) {
hs_error_t HS_CDECL hs_deserialize_database(const char *bytes,
const size_t length,
hs_database_t **db) {
if (!bytes || !db) {
return HS_INVALID;
}
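
From the caller's side these entry points compose into a simple round trip. A hedged sketch (error handling compressed; default allocator assumed, so the serialized buffer is released with `free()`):

```cpp
#include <hs/hs.h> // may be plain "hs.h" depending on install layout
#include <cstdlib>

hs_database_t *clone_via_bytes(const hs_database_t *db) {
    char *bytes = nullptr;
    size_t len = 0;
    if (hs_serialize_database(db, &bytes, &len) != HS_SUCCESS) {
        return nullptr;
    }
    hs_database_t *copy = nullptr;
    hs_error_t err = hs_deserialize_database(bytes, len, &copy);
    std::free(bytes); // serialized form came from the misc allocator
    return err == HS_SUCCESS ? copy : nullptr;
}
```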
@ -286,7 +289,7 @@ hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
}
HS_PUBLIC_API
hs_error_t hs_database_size(const hs_database_t *db, size_t *size) {
hs_error_t HS_CDECL hs_database_size(const hs_database_t *db, size_t *size) {
if (!size) {
return HS_INVALID;
}
@ -301,8 +304,9 @@ hs_error_t hs_database_size(const hs_database_t *db, size_t *size) {
}
HS_PUBLIC_API
hs_error_t hs_serialized_database_size(const char *bytes, const size_t length,
size_t *size) {
hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes,
const size_t length,
size_t *size) {
// Decode and check the header
hs_database_t header;
hs_error_t ret = db_decode_header(&bytes, length, &header);
@ -366,7 +370,9 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
u8 minor = (version >> 16) & 0xff;
u8 major = (version >> 24) & 0xff;
const char *avx2 = (plat & HS_PLATFORM_NOAVX2) ? "NOAVX2" : " AVX2";
const char *features = (plat & HS_PLATFORM_NOAVX512)
? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2"
: "AVX512";
const char *mode = NULL;
@ -395,7 +401,7 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
// that don't have snprintf but have a workalike.
int p_len = SNPRINTF_COMPAT(
buf, len, "Version: %u.%u.%u Features: %s Mode: %s",
major, minor, release, avx2, mode);
major, minor, release, features, mode);
if (p_len < 0) {
DEBUG_PRINTF("snprintf output error, returned %d\n", p_len);
hs_misc_free(buf);
@ -414,8 +420,8 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
}
HS_PUBLIC_API
hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
char **info) {
hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes,
size_t length, char **info) {
if (!info) {
return HS_INVALID;
}
@ -434,7 +440,7 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
}
HS_PUBLIC_API
hs_error_t hs_database_info(const hs_database_t *db, char **info) {
hs_error_t HS_CDECL hs_database_info(const hs_database_t *db, char **info) {
if (!info) {
return HS_INVALID;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,6 +41,7 @@ extern "C"
#include "hs_compile.h" // for HS_MODE_ flags
#include "hs_version.h"
#include "ue2common.h"
#include "util/arch.h"
#define HS_DB_VERSION HS_VERSION_32BIT
#define HS_DB_MAGIC (0xdbdbdbdbU)
@ -53,14 +54,18 @@ extern "C"
#define HS_PLATFORM_CPU_MASK 0x3F
#define HS_PLATFORM_NOAVX2 (4<<13)
#define HS_PLATFORM_NOAVX512 (8<<13)
/** \brief Platform features bitmask. */
typedef u64a platform_t;
static UNUSED
const platform_t hs_current_platform = {
#if !defined(__AVX2__)
#if !defined(HAVE_AVX2)
HS_PLATFORM_NOAVX2 |
#endif
#if !defined(HAVE_AVX512)
HS_PLATFORM_NOAVX512 |
#endif
0,
};
@ -68,6 +73,13 @@ const platform_t hs_current_platform = {
static UNUSED
const platform_t hs_current_platform_no_avx2 = {
HS_PLATFORM_NOAVX2 |
HS_PLATFORM_NOAVX512 |
0,
};
static UNUSED
const platform_t hs_current_platform_no_avx512 = {
HS_PLATFORM_NOAVX512 |
0,
};
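
Under this scheme a database loads if its platform word matches the host exactly or describes a strict subset of the host's features. A compile-time sketch for a host with AVX2 but no AVX-512 (constants mirror the header; the helper name is illustrative):

```cpp
#include <cstdint>

using platform_t = uint64_t;

constexpr platform_t NOAVX2   = 4u << 13; // mirrors HS_PLATFORM_NOAVX2
constexpr platform_t NOAVX512 = 8u << 13; // mirrors HS_PLATFORM_NOAVX512

// Host with AVX2 but no AVX-512: the "current" platform already carries
// the NOAVX512 bit, so current and current_no_avx512 coincide here.
constexpr platform_t current           = NOAVX512;
constexpr platform_t current_no_avx2   = NOAVX2 | NOAVX512;
constexpr platform_t current_no_avx512 = NOAVX512;

constexpr bool db_platform_ok(platform_t p) {
    return p == current || p == current_no_avx2 || p == current_no_avx512;
}

static_assert(db_platform_ok(NOAVX2 | NOAVX512), "SSE-only db accepted");
static_assert(!db_platform_ok(0), "db needing AVX-512 rejected here");
```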

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,8 +33,14 @@
#include "util/cpuid_flags.h"
#include "util/join.h"
#if defined(DISABLE_AVX512_DISPATCH)
#define avx512_ disabled_
#define check_avx512() (0)
#endif
#define CREATE_DISPATCH(RTYPE, NAME, ...) \
/* create defns */ \
RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \
RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \
RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \
RTYPE JOIN(core2_, NAME)(__VA_ARGS__); \
@ -46,6 +52,9 @@
\
/* resolver */ \
static void(*JOIN(resolve_, NAME)(void)) { \
if (check_avx512()) { \
return JOIN(avx512_, NAME); \
} \
if (check_avx2()) { \
return JOIN(avx2_, NAME); \
} \
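
Expanded by hand, the macro produces one declaration per microarchitecture plus a resolver that returns the best implementation the CPU supports. A runnable sketch of that resolver pattern with stub implementations and stand-in feature checks (all names hypothetical):

```cpp
#include <cstddef>
#include <cstdio>

using scan_fn = int (*)(const unsigned char *, size_t);

// Stub "specialisations"; the real build compiles one object per arch.
static int scan_avx512(const unsigned char *, size_t) { return 512; }
static int scan_avx2(const unsigned char *, size_t)   { return 256; }
static int scan_core2(const unsigned char *, size_t)  { return 128; }

// Stand-ins for the CPUID-based feature tests.
static bool check_avx512() { return false; } // cf. DISABLE_AVX512_DISPATCH
static bool check_avx2()   { return true; }

static scan_fn resolve_scan() {
    if (check_avx512()) { return scan_avx512; }
    if (check_avx2())   { return scan_avx2; }
    return scan_core2;
}

int main() {
    scan_fn fn = resolve_scan();
    std::printf("selected width: %d\n", fn(nullptr, 0));
}
```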

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -38,29 +38,19 @@ class EngineDescription {
u32 id;
target_t code_target; // the target that we built this code for
u32 numBuckets;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;
public:
EngineDescription(u32 id_in, const target_t &code_target_in,
u32 numBuckets_in, u32 confirmPullBackDistance_in,
u32 confirmTopLevelSplit_in)
: id(id_in), code_target(code_target_in), numBuckets(numBuckets_in),
confirmPullBackDistance(confirmPullBackDistance_in),
confirmTopLevelSplit(confirmTopLevelSplit_in) {}
u32 numBuckets_in)
: id(id_in), code_target(code_target_in), numBuckets(numBuckets_in) {}
virtual ~EngineDescription();
u32 getID() const { return id; }
u32 getNumBuckets() const { return numBuckets; }
u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; }
bool isValidOnTarget(const target_t &target_in) const;
virtual u32 getDefaultFloodSuffixLength() const = 0;
virtual bool typicallyHoldsOneCharLits() const { return true; }
};
/** Returns a target given a CPU feature set value. */

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -34,7 +34,9 @@
#include "flood_runtime.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"
/** \brief number of bytes processed in each iteration */
#define ITER_BYTES 16
@ -51,7 +53,7 @@
*
* The incoming buffer is split into multiple zones to ensure two properties:
* 1: that we can read up to 8 bytes behind to generate a hash safely
* 2: that we can read the byte after the current byte (domain > 8)
* 2: that we can read the 3 bytes after the current byte (domain > 8)
*/
struct zone {
/** \brief copied buffer, used only when it is a boundary zone. */
@ -116,20 +118,34 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn(const u32 a, const u8 *b) {
u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
r = unaligned_load_u32(b) & ~a;
#endif
return r;
}
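
Semantically, andn() computes `unaligned_load_u32(b) & ~a`; with `a = ~fdr->domainMask` the result is the loaded bytes masked down to the hash-table index ("reach"). A portable model of the fallback path:

```cpp
#include <cstdint>
#include <cstring>

// Model of the fallback path: load 32 bits unaligned and clear the bits
// of the flipped mask. With a == ~fdr->domainMask, the result is
// load(b) & fdr->domainMask, i.e. the table index ("reach").
inline uint64_t andn_model(uint32_t a, const uint8_t *b) {
    uint32_t v;
    std::memcpy(&v, b, sizeof(v)); // unaligned-safe load
    return v & ~a;
}
```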
/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
* confirms.
*/
static really_inline
m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
const struct zone *z) {
m128 s;
if (len_history) {
/* +1: the zones ensure that we can read the byte at z->end */
u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
tmp &= fdr->domainMask;
s = *((const m128 *)ft + tmp);
s = load_m128_from_u64a(ft + tmp);
s = rshiftbyte_m128(s, 1);
} else {
s = fdr->start;
@ -138,51 +154,30 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
}
static really_inline
void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
u64a current_data_0;
u64a current_data_8;
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st1 = load_m128_from_u64a(ft + reach1);
m128 st2 = load_m128_from_u64a(ft + reach2);
m128 st3 = load_m128_from_u64a(ft + reach3);
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) &
domain_mask_adjusted;
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st1 = *(const m128 *)(ft + v1*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st3 = *(const m128 *)(ft + v3*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st5 = *(const m128 *)(ft + v5*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st7 = *(const m128 *)(ft + v7*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st9 = *(const m128 *)(ft + v9*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st11 = *(const m128 *)(ft + v11*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st13 = *(const m128 *)(ft + v13*8);
m128 st14 = *(const m128 *)(ft + v14*8);
m128 st15 = *(const m128 *)(ft + v15*8);
m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st5 = load_m128_from_u64a(ft + reach5);
m128 st6 = load_m128_from_u64a(ft + reach6);
m128 st7 = load_m128_from_u64a(ft + reach7);
st1 = lshiftbyte_m128(st1, 1);
st2 = lshiftbyte_m128(st2, 2);
@ -191,6 +186,40 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
st5 = lshiftbyte_m128(st5, 5);
st6 = lshiftbyte_m128(st6, 6);
st7 = lshiftbyte_m128(st7, 7);
st0 = or128(st0, st1);
st2 = or128(st2, st3);
st4 = or128(st4, st5);
st6 = or128(st6, st7);
st0 = or128(st0, st2);
st4 = or128(st4, st6);
st0 = or128(st0, st4);
*s = or128(*s, st0);
*conf0 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st9 = load_m128_from_u64a(ft + reach9);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st11 = load_m128_from_u64a(ft + reach11);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
m128 st12 = load_m128_from_u64a(ft + reach12);
m128 st13 = load_m128_from_u64a(ft + reach13);
m128 st14 = load_m128_from_u64a(ft + reach14);
m128 st15 = load_m128_from_u64a(ft + reach15);
st9 = lshiftbyte_m128(st9, 1);
st10 = lshiftbyte_m128(st10, 2);
st11 = lshiftbyte_m128(st11, 3);
@ -199,100 +228,86 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
st14 = lshiftbyte_m128(st14, 6);
st15 = lshiftbyte_m128(st15, 7);
*s = or128(*s, st0);
*s = or128(*s, st1);
*s = or128(*s, st2);
*s = or128(*s, st3);
*s = or128(*s, st4);
*s = or128(*s, st5);
*s = or128(*s, st6);
*s = or128(*s, st7);
*conf0 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
st8 = or128(st8, st9);
st10 = or128(st10, st11);
st12 = or128(st12, st13);
st14 = or128(st14, st15);
st8 = or128(st8, st10);
st12 = or128(st12, st14);
st8 = or128(st8, st12);
*s = or128(*s, st8);
*s = or128(*s, st9);
*s = or128(*s, st10);
*s = or128(*s, st11);
*s = or128(*s, st12);
*s = or128(*s, st13);
*s = or128(*s, st14);
*s = or128(*s, st15);
*conf8 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st2 = load_m128_from_u64a(ft + reach2);
m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st6 = load_m128_from_u64a(ft + reach6);
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st2 = *(const m128 *)(ft + v2*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st6 = *(const m128 *)(ft + v6*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st10 = *(const m128 *)(ft + v10*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st14 = *(const m128 *)(ft + v14*8);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st12 = load_m128_from_u64a(ft + reach12);
m128 st14 = load_m128_from_u64a(ft + reach14);
st2 = lshiftbyte_m128(st2, 2);
st4 = lshiftbyte_m128(st4, 4);
st6 = lshiftbyte_m128(st6, 6);
st10 = lshiftbyte_m128(st10, 2);
st12 = lshiftbyte_m128(st12, 4);
st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
*s = or128(*s, st4);
*s = or128(*s, st6);
*conf0 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
st10 = lshiftbyte_m128(st10, 2);
st12 = lshiftbyte_m128(st12, 4);
st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st8);
*s = or128(*s, st10);
*s = or128(*s, st12);
*s = or128(*s, st14);
*conf8 = movq(*s);
*s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
static really_inline
void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
u64a *conf8, m128 *s) {
u64a current_data_0;
u64a current_data_8;
void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
m128 st0 = *(const m128 *)(ft + v0*8);
m128 st4 = *(const m128 *)(ft + v4*8);
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st12 = load_m128_from_u64a(ft + reach12);
st4 = lshiftbyte_m128(st4, 4);
st12 = lshiftbyte_m128(st12, 4);
@ -315,7 +330,6 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
const u32 *confBase, const struct FDR_Runtime_Args *a,
const u8 *ptr, u32 *last_match_id, struct zone *z) {
const u8 bucket = 8;
const u8 pullback = 1;
if (likely(!*conf)) {
return;
@ -332,8 +346,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
u32 bit = findAndClearLSB_64(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr + byte);
u32 idx = confSplit * bucket + bitRem;
u32 idx = bitRem;
u32 cf = confBase[idx];
if (!cf) {
continue;
@ -343,18 +356,8 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
if (!(fdrc->groups & *control)) {
continue;
}
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) {
continue;
}
*last_match_id = id;
*control = a->cb(ptr_main + byte - a->buf, ptr_main + byte - a->buf,
id, a->ctxt);
continue;
}
u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a));
confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, control,
u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
last_match_id, confVal);
} while (unlikely(!!*conf));
}
@ -496,6 +499,7 @@ void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64);
*z_end = 0;
z->end = z_end;
@ -566,15 +570,19 @@ void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
assert(ZONE_START_BEGIN + copy_len + 3 < 64);
}
/**
* \brief Create a zone for the end region.
*
* This function requires that there is > ITER_BYTES of data in the buffer to
* scan. The end zone, however, is only responsible for a scanning the <=
* ITER_BYTES rump of data. The end zone is required to handle a full ITER_BYTES
* iteration as the main loop cannot handle the last byte of the buffer.
* scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
* data, plus an optional extra full ITER_BYTES. The main zone cannot handle
* the last 3 bytes of the buffer. When fewer than 3 bytes would be left to
* scan, the main zone is reduced by ITER_BYTES and the end zone handles that
* full ITER_BYTES iteration as well.
*
* This zone ensures that the byte at z->end can be read by filling it with a
* padding character.
@ -592,31 +600,45 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
ptrdiff_t z_len = end - begin;
assert(z_len > 0);
assert(z_len <= ITER_BYTES);
size_t iter_bytes_second = 0;
size_t z_len_first = z_len;
if (z_len > ITER_BYTES) {
z_len_first = z_len - ITER_BYTES;
iter_bytes_second = ITER_BYTES;
}
z->shift = ITER_BYTES - z_len_first;
z->shift = ITER_BYTES - z_len;
const u8 *end_first = end - iter_bytes_second;
/* The amount of data we have to copy from main buffer for the
* first iteration. */
size_t copy_len_first = MIN((size_t)(end_first - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len_first >= 16);
/* The amount of data we have to copy from main buffer. */
size_t copy_len = MIN((size_t)(end - buf),
ITER_BYTES + sizeof(CONF_TYPE));
assert(copy_len >= 16);
size_t total_copy_len = copy_len_first + iter_bytes_second;
assert(total_copy_len + 3 < 64);
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang */
z->buf[copy_len] = 0;
z->buf[total_copy_len] = 0;
/* set the start and end location of the zone buf
* to be scanned */
u8 *z_end = z->buf + copy_len;
u8 *z_end = z->buf + total_copy_len;
z->end = z_end;
z->start = z_end - ITER_BYTES;
z->start = z_end - ITER_BYTES - iter_bytes_second;
assert(z->start + z->shift == z_end - z_len);
u8 *z_end_first = z_end - iter_bytes_second;
/* copy the first 8 bytes of the valid region */
unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len));
unaligned_store_u64a(z->buf,
unaligned_load_u64a(end_first - copy_len_first));
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128)));
if (iter_bytes_second) {
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
}
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
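
The first-iteration/second-iteration arithmetic above is easy to misread. A standalone sketch of just the split rule, assuming ITER_BYTES is 16:

```cpp
#include <cassert>
#include <cstddef>

constexpr size_t kIterBytes = 16; // stands in for ITER_BYTES

struct EndZoneSplit {
    size_t z_len_first;       // bytes covered by the first iteration
    size_t iter_bytes_second; // 0, or one extra full iteration
    size_t shift;             // start-of-zone shift for the first pass
};

EndZoneSplit splitEndZone(size_t z_len) {
    EndZoneSplit s{z_len, 0, 0};
    if (z_len > kIterBytes) {
        // The main zone gave up its final iteration: scan a short first
        // pass, then one full ITER_BYTES pass.
        s.z_len_first = z_len - kIterBytes;
        s.iter_bytes_second = kIterBytes;
    }
    assert(s.z_len_first <= kIterBytes);
    s.shift = kIterBytes - s.z_len_first;
    return s;
}
// e.g. splitEndZone(20) -> {4, 16, 12}: 4 new bytes in the first pass
// (shift 12), then a full 16-byte second pass.
```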
@ -651,13 +673,13 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
/* find maximum buffer location that the main zone can scan
* - must be a multiple of ITER_BYTES, and
* - cannot contain the last byte (due to overhang)
* - cannot contain the last 3 bytes (due to the 3-byte read behind the
end of the buffer in the FDR main loop)
*/
const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES);
assert(main_end >= ptr);
const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES);
/* create a zone if multiple of ITER_BYTES are found */
if (main_end != ptr) {
if (main_end > ptr) {
createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
ptr = main_end;
}
@ -684,10 +706,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
return HWLM_TERMINATED; \
} \
} \
__builtin_prefetch(itPtr + (ITER_BYTES*4)); \
__builtin_prefetch(itPtr + ITER_BYTES); \
u64a conf0; \
u64a conf8; \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
ft, &conf0, &conf8, &s); \
do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
&last_match_id, zz); \
@ -705,10 +727,11 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
hwlm_group_t control) {
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u64a domain_mask_adjusted = fdr->domainMask << 1;
u32 domain_mask_flipped = ~fdr->domainMask;
u8 stride = fdr->stride;
const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
const u32 *confBase = (const u32 *)(ft + fdr->tabSize);
const u64a *ft =
(const u64a *)((const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)));
const u32 *confBase = (const u32 *)((const u8 *)ft + fdr->tabSize);
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
@ -761,7 +784,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
return HWLM_SUCCESS;
}
#if defined(__AVX2__)
#if defined(HAVE_AVX2)
#define ONLY_AVX2(func) func
#else
#define ONLY_AVX2(func) NULL
@ -773,8 +796,8 @@ typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
static const FDRFUNCTYPE funcs[] = {
fdr_engine_exec,
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
NULL, /* old: fast teddy */
NULL, /* old: fast teddy */
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,8 +30,9 @@
* \brief FDR literal matcher: build API.
*/
#include "fdr_internal.h"
#include "fdr_compile.h"
#include "fdr_internal.h"
#include "fdr_confirm.h"
#include "fdr_compile_internal.h"
#include "fdr_engine_description.h"
@ -40,9 +41,10 @@
#include "grey.h"
#include "ue2common.h"
#include "hwlm/hwlm_build.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/dump_mask.h"
#include "util/math.h"
#include "util/noncopyable.h"
#include "util/target_info.h"
#include "util/ue2string.h"
#include "util/verify_types.h"
@ -53,13 +55,15 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
#include <boost/multi_array.hpp>
using namespace std;
@ -67,31 +71,31 @@ namespace ue2 {
namespace {
class FDRCompiler : boost::noncopyable {
class FDRCompiler : noncopyable {
private:
const FDREngineDescription &eng;
const Grey &grey;
vector<u8> tab;
const vector<hwlmLiteral> &lits;
vector<hwlmLiteral> lits;
map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
bool make_small;
u8 *tabIndexToMask(u32 indexInTable);
void assignStringToBucket(LiteralIndex l, BucketIndex b);
void assignStringsToBuckets();
#ifdef DEBUG
void dumpMasks(const u8 *defaultMask);
#endif
void setupTab();
aligned_unique_ptr<FDR> setupFDR(pair<aligned_unique_ptr<u8>, size_t> &link);
bytecode_ptr<FDR> setupFDR();
void createInitialState(FDR *fdr);
public:
FDRCompiler(const vector<hwlmLiteral> &lits_in,
const FDREngineDescription &eng_in, bool make_small_in)
: eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in),
make_small(make_small_in) {}
FDRCompiler(vector<hwlmLiteral> lits_in, const FDREngineDescription &eng_in,
bool make_small_in, const Grey &grey_in)
: eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()),
lits(move(lits_in)), make_small(make_small_in) {}
aligned_unique_ptr<FDR> build(pair<aligned_unique_ptr<u8>, size_t> &link);
bytecode_ptr<FDR> build();
};
u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) {
@ -140,27 +144,25 @@ void FDRCompiler::createInitialState(FDR *fdr) {
}
}
aligned_unique_ptr<FDR>
FDRCompiler::setupFDR(pair<aligned_unique_ptr<u8>, size_t> &link) {
bytecode_ptr<FDR> FDRCompiler::setupFDR() {
size_t tabSize = eng.getTabSizeBytes();
auto floodControlTmp = setupFDRFloodControl(lits, eng);
auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small);
auto floodControlTmp = setupFDRFloodControl(lits, eng, grey);
auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small);
assert(ISALIGNED_16(tabSize));
assert(ISALIGNED_16(confirmTmp.second));
assert(ISALIGNED_16(floodControlTmp.second));
assert(ISALIGNED_16(link.second));
assert(ISALIGNED_16(confirmTmp.size()));
assert(ISALIGNED_16(floodControlTmp.size()));
size_t headerSize = ROUNDUP_16(sizeof(FDR));
size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.second +
floodControlTmp.second + link.second);
size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.size() +
floodControlTmp.size());
DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu "
"total=%zu\n",
headerSize, tabSize, confirmTmp.second, floodControlTmp.second,
headerSize, tabSize, confirmTmp.size(), floodControlTmp.size(),
size);
aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
assert(fdr); // otherwise would have thrown std::bad_alloc
fdr->size = size;
@ -169,16 +171,16 @@ FDRCompiler::setupFDR(pair<aligned_unique_ptr<u8>, size_t> &link) {
createInitialState(fdr.get());
u8 *fdr_base = (u8 *)fdr.get();
u8 * ptr = fdr_base + ROUNDUP_16(sizeof(FDR));
u8 *ptr = fdr_base + ROUNDUP_16(sizeof(FDR));
copy(tab.begin(), tab.end(), ptr);
ptr += tabSize;
memcpy(ptr, confirmTmp.first.get(), confirmTmp.second);
ptr += confirmTmp.second;
memcpy(ptr, confirmTmp.get(), confirmTmp.size());
ptr += confirmTmp.size();
fdr->floodOffset = verify_u32(ptr - fdr_base);
memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second);
ptr += floodControlTmp.second;
memcpy(ptr, floodControlTmp.get(), floodControlTmp.size());
ptr += floodControlTmp.size();
/* we are allowing domains 9 to 15 only */
assert(eng.bits > 8 && eng.bits < 16);
@ -187,76 +189,124 @@ FDRCompiler::setupFDR(pair<aligned_unique_ptr<u8>, size_t> &link) {
fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8);
fdr->stride = eng.stride;
if (link.first) {
fdr->link = verify_u32(ptr - fdr_base);
memcpy(ptr, link.first.get(), link.second);
} else {
fdr->link = 0;
}
return fdr;
}
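
The resulting bytecode is laid out as header, hash table, confirm data, then flood control, with each boundary rounded up to 16 bytes. A sketch with made-up sizes to show the offset arithmetic (the real sizes come from the engine description and the confirm/flood builders):

```cpp
#include <cstddef>
#include <cstdio>

static size_t roundup16(size_t v) { return (v + 15) & ~size_t{15}; }

int main() {
    size_t headerSize  = roundup16(104);  // sizeof(struct FDR): made up
    size_t tabSize     = (1u << 13) * 16; // 13-bit domain, 128-bit scheme
    size_t confirmSize = 4096;            // placeholder
    size_t floodSize   = 1024;            // placeholder
    size_t total = roundup16(headerSize + tabSize + confirmSize + floodSize);
    std::printf("tab@%zu confirm@%zu flood@%zu total=%zu\n",
                headerSize, headerSize + tabSize,
                headerSize + tabSize + confirmSize, total);
}
```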
void FDRCompiler::assignStringToBucket(LiteralIndex l, BucketIndex b) {
bucketToLits[b].push_back(l);
//#define DEBUG_ASSIGNMENT
static
double getScoreUtil(u32 len, u32 count) {
return len == 0 ? numeric_limits<double>::max()
: our_pow(count, 1.05) * our_pow(len, -3.0);
}
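
The cubic length term dominates: doubling literal length divides a chunk's score by eight, so short literals are the expensive ones to crowd into a bucket. A worked example (assuming our_pow behaves as std::pow):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    auto score = [](double count, double len) {
        return std::pow(count, 1.05) * std::pow(len, -3.0);
    };
    std::printf("len 4, 100 lits: %.3f\n", score(100, 4)); // ~1.967
    std::printf("len 8, 100 lits: %.3f\n", score(100, 8)); // ~0.246
}
```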
struct LitOrder {
explicit LitOrder(const vector<hwlmLiteral> &vl_) : vl(vl_) {}
bool operator()(const u32 &i1, const u32 &i2) const {
const string &i1s = vl[i1].s;
const string &i2s = vl[i2].s;
/**
* Returns true if the two given literals should be placed in the same chunk as
* they are identical except for a difference in caselessness.
*/
static
bool isEquivLit(const hwlmLiteral &a, const hwlmLiteral &b,
const hwlmLiteral *last_nocase_lit) {
const size_t a_len = a.s.size();
const size_t b_len = b.s.size();
size_t len1 = i1s.size(), len2 = i2s.size();
if (a_len != b_len) {
return false;
}
if (len1 != len2) {
return len1 < len2;
} else {
auto p = std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin());
if (p.first == i1s.rend()) {
return false;
bool nocase = last_nocase_lit && a_len == last_nocase_lit->s.size() &&
!cmp(a.s.c_str(), last_nocase_lit->s.c_str(), a_len, true);
return !cmp(a.s.c_str(), b.s.c_str(), a.s.size(), nocase);
}
struct Chunk {
Chunk(u32 first_id_in, u32 count_in, u32 length_in)
: first_id(first_id_in), count(count_in), length(length_in) {}
u32 first_id; //!< first id in this chunk
u32 count; //!< how many are in this chunk
u32 length; //!< how long things in the chunk are
};
static
vector<Chunk> assignChunks(const vector<hwlmLiteral> &lits,
const map<u32, u32> &lenCounts) {
const u32 CHUNK_MAX = 512;
const u32 MAX_CONSIDERED_LENGTH = 16;
// TODO: detailed early-stage literal analysis for very small cases
// (actually look at the literals). After we factor this out and merge in
// the Teddy style of building we can revisit this, although the Teddy
// merge modelling is quite different. It's still probably adaptable to
// some extent for this class of problem.
vector<Chunk> chunks;
chunks.reserve(CHUNK_MAX);
const u32 maxPerChunk = lits.size() /
(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
u32 currentSize = 0;
u32 chunkStartID = 0;
const hwlmLiteral *last_nocase_lit = nullptr;
for (u32 i = 0; i < lits.size() && chunks.size() < CHUNK_MAX - 1; i++) {
const auto &lit = lits[i];
DEBUG_PRINTF("i=%u, lit=%s%s\n", i, escapeString(lit.s).c_str(),
lit.nocase ? " (nocase)" : "");
// If this literal is identical to the last one (aside from differences
// in caselessness), keep going even if we will "overfill" a chunk; we
// don't want to split identical literals into different buckets.
if (i != 0 && isEquivLit(lit, lits[i - 1], last_nocase_lit)) {
DEBUG_PRINTF("identical lit\n");
goto next_literal;
}
if ((currentSize < MAX_CONSIDERED_LENGTH &&
(lit.s.size() != currentSize)) ||
(currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
currentSize = lit.s.size();
if (!chunks.empty()) {
chunks.back().count = i - chunkStartID;
}
return *p.first < *p.second;
chunkStartID = i;
chunks.emplace_back(i, 0, currentSize);
}
next_literal:
if (lit.nocase) {
last_nocase_lit = &lit;
}
}
private:
const vector<hwlmLiteral> &vl;
};
assert(!chunks.empty());
chunks.back().count = lits.size() - chunkStartID;
// close off chunks with an empty row
chunks.emplace_back(lits.size(), 0, 0);
static u64a getScoreUtil(u32 len, u32 count) {
if (len == 0) {
return (u64a)-1;
#ifdef DEBUG_ASSIGNMENT
for (size_t j = 0; j < chunks.size(); j++) {
const auto &chunk = chunks[j];
printf("chunk %zu first_id=%u count=%u length=%u\n", j, chunk.first_id,
chunk.count, chunk.length);
}
const u32 LEN_THRESH = 128;
const u32 elen = (len > LEN_THRESH) ? LEN_THRESH : len;
const u64a lenScore =
(LEN_THRESH * LEN_THRESH * LEN_THRESH) / (elen * elen * elen);
return count * lenScore; // deemphasize count - possibly more than needed
// this might be overkill in the other direction
#endif
DEBUG_PRINTF("built %zu chunks (%zu lits)\n", chunks.size(), lits.size());
assert(chunks.size() <= CHUNK_MAX);
return chunks;
}
//#define DEBUG_ASSIGNMENT
void FDRCompiler::assignStringsToBuckets() {
typedef u64a SCORE; // 'Score' type
const SCORE MAX_SCORE = (SCORE)-1;
const u32 CHUNK_MAX = 512;
const u32 BUCKET_MAX = 16;
typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
const double MAX_SCORE = numeric_limits<double>::max();
u32 ls = verify_u32(lits.size());
assert(ls); // Shouldn't be called with no literals.
assert(!lits.empty()); // Shouldn't be called with no literals.
// make a vector that contains our literals as pointers or u32 LiteralIndex values
vector<LiteralIndex> vli;
vli.resize(ls);
// Count the number of literals for each length.
map<u32, u32> lenCounts;
for (LiteralIndex l = 0; l < ls; l++) {
vli[l] = l;
lenCounts[lits[l].s.size()]++;
for (const auto &lit : lits) {
lenCounts[lit.s.size()]++;
}
// sort vector by literal length + if tied on length, 'magic' criteria of some kind (tbd)
stable_sort(vli.begin(), vli.end(), LitOrder(lits));
#ifdef DEBUG_ASSIGNMENT
for (const auto &m : lenCounts) {
@ -265,103 +315,94 @@ void FDRCompiler::assignStringsToBuckets() {
printf("\n");
#endif
// TODO: detailed early stage literal analysis for v. small cases (actually look at lits)
// yes - after we factor this out and merge in the Teddy style of building we can look
// at this, although the teddy merge modelling is quite different. It's still probably
// adaptable to some extent for this class of problem
// Sort literals by literal length. If tied on length, use lexicographic
// ordering (of the reversed literals).
stable_sort(lits.begin(), lits.end(),
[](const hwlmLiteral &a, const hwlmLiteral &b) {
if (a.s.size() != b.s.size()) {
return a.s.size() < b.s.size();
}
auto p = mismatch(a.s.rbegin(), a.s.rend(), b.s.rbegin());
if (p.first != a.s.rend()) {
return *p.first < *p.second;
}
// Sort caseless variants first.
return a.nocase > b.nocase;
});
u32 firstIds[CHUNK_MAX]; // how many are in this chunk (CHUNK_MAX - 1 contains 'last' bound)
u32 count[CHUNK_MAX]; // how many are in this chunk
u32 length[CHUNK_MAX]; // how long things in the chunk are
vector<Chunk> chunks = assignChunks(lits, lenCounts);
const u32 MAX_CONSIDERED_LENGTH = 16;
u32 currentChunk = 0;
u32 currentSize = 0;
u32 chunkStartID = 0;
u32 maxPerChunk = ls/(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
const u32 numChunks = chunks.size();
const u32 numBuckets = eng.getNumBuckets();
for (u32 i = 0; i < ls && currentChunk < CHUNK_MAX - 1; i++) {
LiteralIndex l = vli[i];
if ((currentSize < MAX_CONSIDERED_LENGTH && (lits[l].s.size() != currentSize)) ||
(currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
currentSize = lits[l].s.size();
if (currentChunk) {
count[currentChunk - 1 ] = i - chunkStartID;
}
chunkStartID = firstIds[currentChunk] = i;
length[currentChunk] = currentSize;
currentChunk++;
}
}
// 2D array of (score, chunk index) pairs, indexed by
// [chunk_index][bucket_index].
boost::multi_array<pair<double, u32>, 2> t(
boost::extents[numChunks][numBuckets]);
assert(currentChunk > 0);
count[currentChunk - 1] = ls - chunkStartID;
// close off chunks with an empty row
firstIds[currentChunk] = ls;
length[currentChunk] = 0;
count[currentChunk] = 0;
u32 nChunks = currentChunk + 1;
#ifdef DEBUG_ASSIGNMENT
for (u32 j = 0; j < nChunks; j++) {
printf("%d %d %d %d\n", j, firstIds[j], count[j], length[j]);
}
#endif
SCORE_INDEX_PAIR t[CHUNK_MAX][BUCKET_MAX]; // pair of score, index
u32 nb = eng.getNumBuckets();
for (u32 j = 0; j < nChunks; j++) {
for (u32 j = 0; j < numChunks; j++) {
u32 cnt = 0;
for (u32 k = j; k < nChunks; ++k) {
cnt += count[k];
for (u32 k = j; k < numChunks; ++k) {
cnt += chunks[k].count;
}
t[j][0] = {getScoreUtil(length[j], cnt), 0};
t[j][0] = {getScoreUtil(chunks[j].length, cnt), 0};
}
for (u32 i = 1; i < nb; i++) {
for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row
SCORE_INDEX_PAIR best = {MAX_SCORE, 0};
u32 cnt = count[j];
for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) {
SCORE score = getScoreUtil(length[j], cnt);
for (u32 i = 1; i < numBuckets; i++) {
for (u32 j = 0; j < numChunks - 1; j++) { // don't do last, empty row
pair<double, u32> best = {MAX_SCORE, 0};
u32 cnt = chunks[j].count;
for (u32 k = j + 1; k < numChunks - 1; k++) {
auto score = getScoreUtil(chunks[j].length, cnt);
if (score > best.first) {
break; // if we're now worse locally than our best score, give up
break; // now worse locally than our best score, give up
}
score += t[k][i-1].first;
if (score < best.first) {
best = {score, k};
}
cnt += chunks[k].count;
}
t[j][i] = best;
}
t[nChunks - 1][i] = {0,0}; // fill in empty final row for next iteration
t[numChunks - 1][i] = {0,0}; // fill in empty final row for next iter
}
#ifdef DEBUG_ASSIGNMENT
for (u32 j = 0; j < nChunks; j++) {
for (u32 i = 0; i < nb; i++) {
SCORE_INDEX_PAIR v = t[j][i];
printf("<%7lld,%3d>", v.first, v.second);
for (u32 j = 0; j < numChunks; j++) {
printf("%03u: ", j);
for (u32 i = 0; i < numBuckets; i++) {
const auto &v = t[j][i];
printf("<%0.3f,%3d> ", v.first, v.second);
}
printf("\n");
}
#endif
// our best score is in best[0][N_BUCKETS-1] and we can follow the links
// our best score is in t[0][N_BUCKETS-1] and we can follow the links
// to find where our buckets should start and what goes into them
for (u32 i = 0, n = nb; n && (i != nChunks - 1); n--) {
for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) {
u32 j = t[i][n - 1].second;
if (j == 0) {
j = nChunks - 1;
j = numChunks - 1;
}
// put chunks between i - j into bucket (NBUCKETS-1) - n
#ifdef DEBUG_ASSIGNMENT
printf("placing from %d to %d in bucket %d\n", firstIds[i], firstIds[j],
nb - n);
#endif
for (u32 k = firstIds[i]; k < firstIds[j]; k++) {
assignStringToBucket((LiteralIndex)vli[k], nb - n);
// put chunks between i - j into bucket (numBuckets - n).
u32 first_id = chunks[i].first_id;
u32 last_id = chunks[j].first_id;
assert(first_id < last_id);
u32 bucket = numBuckets - n;
UNUSED const auto &first_lit = lits[first_id];
UNUSED const auto &last_lit = lits[last_id - 1];
DEBUG_PRINTF("placing [%u-%u) in bucket %u (%u lits, len %zu-%zu, "
"score %0.4f)\n",
first_id, last_id, bucket, last_id - first_id,
first_lit.s.length(), last_lit.s.length(),
getScoreUtil(first_lit.s.length(), last_id - first_id));
auto &bucket_lits = bucketToLits[bucket];
for (u32 k = first_id; k < last_id; k++) {
bucket_lits.push_back(k);
}
i = j;
}
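
The table fill and backtrack above form a classic interval-partition dynamic program: t[j][i] holds the best score for covering chunks j onward with i+1 buckets, where a bucket is charged via its first (shortest) chunk's length and its total literal count. A self-contained sketch on toy data (Chunk and score are simplified stand-ins for the real types):

```cpp
#include <cmath>
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

struct Chunk { unsigned count; unsigned length; };

static double score(unsigned len, unsigned count) {
    return len == 0 ? std::numeric_limits<double>::max()
                    : std::pow(count, 1.05) * std::pow(len, -3.0);
}

int main() {
    // Three real chunks plus the empty sentinel row, as in assignChunks().
    std::vector<Chunk> chunks = {{100, 2}, {50, 4}, {10, 8}, {0, 0}};
    const unsigned numChunks = (unsigned)chunks.size();
    const unsigned numBuckets = 2;

    // t[j][i]: best (score, split point) for chunks j.. using i+1 buckets.
    std::vector<std::vector<std::pair<double, unsigned>>> t(
        numChunks, std::vector<std::pair<double, unsigned>>(numBuckets));

    for (unsigned j = 0; j < numChunks; j++) {
        unsigned cnt = 0;
        for (unsigned k = j; k < numChunks; ++k) {
            cnt += chunks[k].count;
        }
        t[j][0] = {score(chunks[j].length, cnt), 0}; // one bucket takes all
    }

    for (unsigned i = 1; i < numBuckets; i++) {
        for (unsigned j = 0; j < numChunks - 1; j++) {
            std::pair<double, unsigned> best = {
                std::numeric_limits<double>::max(), 0};
            unsigned cnt = chunks[j].count;
            for (unsigned k = j + 1; k < numChunks - 1; k++) {
                double s = score(chunks[j].length, cnt);
                if (s > best.first) {
                    break; // worse locally than our best: give up
                }
                s += t[k][i - 1].first; // chunks k.. go to the other buckets
                if (s < best.first) {
                    best = {s, k};
                }
                cnt += chunks[k].count;
            }
            t[j][i] = best;
        }
        t[numChunks - 1][i] = {0, 0}; // empty final row
    }

    std::printf("best score %.3f; second bucket starts at chunk %u\n",
                t[0][numBuckets - 1].first, t[0][numBuckets - 1].second);
}
```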
@ -487,49 +528,22 @@ void FDRCompiler::setupTab() {
#endif
}
aligned_unique_ptr<FDR>
FDRCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
bytecode_ptr<FDR> FDRCompiler::build() {
assignStringsToBuckets();
setupTab();
return setupFDR(link);
return setupFDR();
}
} // namespace
static
size_t maxMaskLen(const vector<hwlmLiteral> &lits) {
size_t rv = 0;
for (const auto &lit : lits) {
rv = max(rv, lit.msk.size());
}
return rv;
}
static
void setHistoryRequired(hwlmStreamingControl &stream_ctl,
const vector<hwlmLiteral> &lits) {
size_t max_mask_len = maxMaskLen(lits);
// we want enough history to manage the longest literal and the longest
// mask.
stream_ctl.literal_history_required = max(maxLen(lits), max_mask_len) - 1;
}
static
aligned_unique_ptr<FDR>
fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
const target_t &target, const Grey &grey, u32 hint,
hwlmStreamingControl *stream_control) {
pair<aligned_unique_ptr<u8>, size_t> link(nullptr, 0);
if (stream_control) {
setHistoryRequired(*stream_control, lits);
}
bytecode_ptr<FDR> fdrBuildTableInternal(const vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey, u32 hint) {
DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2");
if (grey.fdrAllowTeddy) {
auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, link);
auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, grey);
if (fdr) {
DEBUG_PRINTF("build with teddy succeeded\n");
return fdr;
@ -538,10 +552,8 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
}
}
const unique_ptr<FDREngineDescription> des =
(hint == HINT_INVALID) ? chooseEngine(target, lits, make_small)
: getFdrDescription(hint);
auto des = (hint == HINT_INVALID) ? chooseEngine(target, lits, make_small)
: getFdrDescription(hint);
if (!des) {
return nullptr;
}
@ -552,27 +564,23 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
des->stride = 1;
}
FDRCompiler fc(lits, *des, make_small);
return fc.build(link);
FDRCompiler fc(lits, *des, make_small, grey);
return fc.build();
}
aligned_unique_ptr<FDR> fdrBuildTable(const vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey,
hwlmStreamingControl *stream_control) {
return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID,
stream_control);
bytecode_ptr<FDR> fdrBuildTable(const vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey) {
return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID);
}
#if !defined(RELEASE_BUILD)
aligned_unique_ptr<FDR>
fdrBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small, u32 hint,
const target_t &target, const Grey &grey,
hwlmStreamingControl *stream_control) {
pair<u8 *, size_t> link(nullptr, 0);
return fdrBuildTableInternal(lits, make_small, target, grey, hint,
stream_control);
bytecode_ptr<FDR> fdrBuildTableHinted(const vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey) {
return fdrBuildTableInternal(lits, make_small, target, grey, hint);
}
#endif
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -34,7 +34,7 @@
#define FDR_COMPILE_H
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include <vector>
@ -43,21 +43,18 @@ struct FDR;
namespace ue2 {
struct hwlmLiteral;
struct hwlmStreamingControl;
struct Grey;
struct target_t;
ue2::aligned_unique_ptr<FDR>
fdrBuildTable(const std::vector<hwlmLiteral> &lits, bool make_small,
const target_t &target, const Grey &grey,
hwlmStreamingControl *stream_control = nullptr);
bytecode_ptr<FDR> fdrBuildTable(const std::vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey);
#if !defined(RELEASE_BUILD)
ue2::aligned_unique_ptr<FDR>
fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
u32 hint, const target_t &target, const Grey &grey,
hwlmStreamingControl *stream_control = nullptr);
bytecode_ptr<FDR> fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target, const Grey &grey);
#endif
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -31,7 +31,7 @@
#include "ue2common.h"
#include "hwlm/hwlm_literal.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include <map>
#include <utility>
@ -55,21 +55,22 @@ typedef u32 PositionInBucket; // zero is 'we are matching right now!',
class EngineDescription;
class FDREngineDescription;
struct hwlmStreamingControl;
struct Grey;
std::pair<aligned_unique_ptr<u8>, size_t> setupFullMultiConfs(
const std::vector<hwlmLiteral> &lits, const EngineDescription &eng,
std::map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits,
bool make_small);
bytecode_ptr<u8> setupFullConfs(const std::vector<hwlmLiteral> &lits,
const EngineDescription &eng,
std::map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits,
bool make_small);
// all suffixes include an implicit max_bucket_width suffix to ensure that
// we always read a full-scale flood "behind" us in terms of what's in our
// state; if we don't have a flood that's long enough we won't be in the
// right state yet to allow blindly advancing
std::pair<aligned_unique_ptr<u8>, size_t>
setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
const EngineDescription &eng);
bytecode_ptr<u8> setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
const EngineDescription &eng,
const Grey &grey);
std::pair<aligned_unique_ptr<u8>, size_t>
bytecode_ptr<u8>
fdrBuildTableStreaming(const std::vector<hwlmLiteral> &lits,
hwlmStreamingControl &stream_control);
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -45,10 +45,7 @@ using namespace std;
namespace ue2 {
using ConfSplitType = u8;
using BucketSplitPair = pair<BucketIndex, ConfSplitType>;
using BC2CONF = map<BucketSplitPair,
pair<aligned_unique_ptr<FDRConfirm>, size_t>>;
using BC2CONF = map<BucketIndex, bytecode_ptr<FDRConfirm>>;
// return the number of bytes beyond a length threshold in all strings in lits
static
@ -150,9 +147,9 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
//#define FDR_CONFIRM_DUMP 1
static pair<aligned_unique_ptr<FDRConfirm>, size_t>
getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
bool make_small, bool make_confirm) {
static
bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
bool make_small, bool make_confirm) {
vector<LitInfo> tmpLitInfo(lits.size());
CONF_TYPE andmsk;
fillLitInfo(lits, tmpLitInfo, andmsk);
@ -166,7 +163,7 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
if (make_small) {
nBits = min(10U, lg2(lits.size()) + 1);
} else {
nBits = min(13U, lg2(lits.size()) + 4);
nBits = lg2(lits.size() + 4);
}
CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
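The odd 64-bit constant above feeds a multiply-shift hash that spreads packed confirm values over 2^nBits chains. A hedged sketch of that scheme follows; the masking step (mirroring fillLitInfo's andmsk output) and the (64 - nBits) shift are assumptions inferred from the surrounding code, not a verbatim copy.

// Hedged sketch of a multiply-shift (Fibonacci-style) hash for confirm
// bucketing; the exact mask/shift placement is an assumption.
#include <cstdint>
#include <cstdio>

using CONF_TYPE = uint64_t;

static uint32_t confHash(CONF_TYPE v, CONF_TYPE andmsk, uint32_t nBits) {
    const CONF_TYPE mult = 0x0b4e0ef37bc32127ULL;
    return (uint32_t)(((v & andmsk) * mult) >> (64 - nBits));
}

int main() {
    // hash the packed last-8-bytes of two inputs into 2^10 chains
    printf("%u\n", confHash(0x6f6c6c6568ULL /* "hello" */, ~0ULL, 10));
    printf("%u\n", confHash(0x646c726f77ULL /* "world" */, ~0ULL, 10));
    return 0;
}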
@ -177,8 +174,7 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
u32 soleLitCmp = 0;
u32 soleLitMsk = 0;
if ((applyOneCharOpt && lits.size() == 1 && lits[0].s.size() == 0 &&
lits[0].msk.empty()) || make_confirm == false) {
if (!make_confirm) {
flags = FDRC_FLAG_NO_CONFIRM;
if (lits[0].noruns) {
flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted
@ -288,7 +284,7 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
sizeof(LitInfo) * lits.size() + totalLitSize;
size = ROUNDUP_N(size, alignof(FDRConfirm));
auto fdrc = aligned_zmalloc_unique<FDRConfirm>(size);
auto fdrc = make_zeroed_bytecode_ptr<FDRConfirm>(size);
assert(fdrc); // otherwise would have thrown std::bad_alloc
fdrc->andmsk = andmsk;
@ -322,32 +318,15 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
LiteralIndex litIdx = *i;
// Write LitInfo header.
u8 *oldPtr = ptr;
LitInfo &finalLI = *(LitInfo *)ptr;
finalLI = tmpLitInfo[litIdx];
ptr += sizeof(LitInfo); // String starts directly after LitInfo.
// Write literal prefix (everything before the last N characters,
// as the last N are already confirmed).
const string &t = lits[litIdx].s;
if (t.size() > sizeof(CONF_TYPE)) {
size_t prefix_len = t.size() - sizeof(CONF_TYPE);
memcpy(ptr, t.c_str(), prefix_len);
ptr += prefix_len;
}
ptr = ROUNDUP_PTR(ptr, alignof(LitInfo));
assert(lits[litIdx].s.size() <= sizeof(CONF_TYPE));
if (next(i) == e) {
finalLI.next = 0;
} else {
// our next field represents an adjustment on top of
// current address + the actual size of the literal
// so we track any rounding up done for alignment and
// add this in - that way we don't have to use bigger
// than a u8 (for now)
assert((size_t)(ptr - oldPtr) > t.size());
finalLI.next = verify_u8(ptr - oldPtr - t.size());
finalLI.next = 1;
}
}
assert((size_t)(ptr - fdrc_base) <= size);
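Since every literal kept in the confirm structure is now at most sizeof(CONF_TYPE) bytes (see the assert above), LitInfo records pack back-to-back and the next field degenerates into a continue/stop flag. A toy walk over such a chain; the struct here is a simplified assumption, not the real LitInfo layout.

// Toy walk over a contiguous LitInfo chain where next is 1 (more records
// follow) or 0 (end of chain). The struct is a simplified assumption.
#include <cstdint>
#include <cstdio>

struct LitInfoToy { uint32_t id; uint8_t next; };

static void walkChain(const LitInfoToy *li) {
    uint8_t oldNext;
    do {
        printf("confirm literal id %u\n", li->id);
        oldNext = li->next;
        li++; // records are adjacent: no byte-offset arithmetic needed
    } while (oldNext);
}

int main() {
    LitInfoToy chain[] = {{7, 1}, {9, 1}, {12, 0}};
    walkChain(chain);
    return 0;
}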
@ -358,19 +337,16 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base),
alignof(FDRConfirm));
assert(actual_size <= size);
fdrc.shrink(actual_size);
return {move(fdrc), actual_size};
return fdrc;
}
static
u32 setupMultiConfirms(const vector<hwlmLiteral> &lits,
const EngineDescription &eng, BC2CONF &bc2Conf,
map<BucketIndex, vector<LiteralIndex> > &bucketToLits,
bool make_small) {
u32 pullBack = eng.getConfirmPullBackDistance();
u32 splitMask = eng.getConfirmTopLevelSplit() - 1;
bool splitHasCase = splitMask & 0x20;
bytecode_ptr<u8>
setupFullConfs(const vector<hwlmLiteral> &lits,
const EngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
bool make_small) {
bool makeConfirm = true;
unique_ptr<TeddyEngineDescription> teddyDescr =
getTeddyDescription(eng.getID());
@ -378,101 +354,43 @@ u32 setupMultiConfirms(const vector<hwlmLiteral> &lits,
makeConfirm = teddyDescr->needConfirm(lits);
}
BC2CONF bc2Conf;
u32 totalConfirmSize = 0;
for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
if (!bucketToLits[b].empty()) {
vector<vector<hwlmLiteral>> vl(eng.getConfirmTopLevelSplit());
vector<hwlmLiteral> vl;
for (const LiteralIndex &lit_idx : bucketToLits[b]) {
hwlmLiteral lit = lits[lit_idx]; // copy
// c is last char of this literal
u8 c = *(lit.s.rbegin());
bool suppressSplit = false;
if (pullBack) {
// make a shorter string to work over if we're pulling back
// getFDRConfirm doesn't know about that stuff
assert(lit.s.size() >= pullBack);
lit.s.resize(lit.s.size() - pullBack);
u8 c_sub, c_sub_msk;
if (lit.msk.empty()) {
c_sub = 0;
c_sub_msk = 0;
} else {
c_sub = *(lit.cmp.rbegin());
c_sub_msk = *(lit.msk.rbegin());
size_t len = lit.msk.size() -
min(lit.msk.size(), (size_t)pullBack);
lit.msk.resize(len);
lit.cmp.resize(len);
}
// if c_sub_msk is 0xff and lit.nocase
// resteer 'c' to an exact value and set suppressSplit
if ((c_sub_msk == 0xff) && (lit.nocase)) {
suppressSplit = true;
c = c_sub;
}
}
if (!suppressSplit && splitHasCase && lit.nocase &&
ourisalpha(c)) {
vl[(u8)(mytoupper(c) & splitMask)].push_back(lit);
vl[(u8)(mytolower(c) & splitMask)].push_back(lit);
} else {
vl[c & splitMask].push_back(lit);
}
vl.push_back(lits[lit_idx]);
}
for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) {
if (vl[c].empty()) {
continue;
}
DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size());
auto key = make_pair(b, c);
auto fc = getFDRConfirm(vl[c], eng.typicallyHoldsOneCharLits(),
make_small, makeConfirm);
totalConfirmSize += fc.second;
assert(bc2Conf.find(key) == end(bc2Conf));
bc2Conf.emplace(key, move(fc));
}
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
auto fc = getFDRConfirm(vl, make_small, makeConfirm);
totalConfirmSize += fc.size();
bc2Conf.emplace(b, move(fc));
}
}
return totalConfirmSize;
}
pair<aligned_unique_ptr<u8>, size_t>
setupFullMultiConfs(const vector<hwlmLiteral> &lits,
const EngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
bool make_small) {
BC2CONF bc2Conf;
u32 totalConfirmSize = setupMultiConfirms(lits, eng, bc2Conf, bucketToLits,
make_small);
u32 primarySwitch = eng.getConfirmTopLevelSplit();
u32 nBuckets = eng.getNumBuckets();
u32 totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32);
u32 totalConfSwitchSize = nBuckets * sizeof(u32);
u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize);
auto buf = aligned_zmalloc_unique<u8>(totalSize);
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
assert(buf); // otherwise would have thrown std::bad_alloc
u32 *confBase = (u32 *)buf.get();
u8 *ptr = buf.get() + totalConfSwitchSize;
for (const auto &m : bc2Conf) {
const BucketIndex &b = m.first.first;
const u8 &c = m.first.second;
const pair<aligned_unique_ptr<FDRConfirm>, size_t> &p = m.second;
const BucketIndex &idx = m.first;
const bytecode_ptr<FDRConfirm> &p = m.second;
// confirm offset is relative to the base of this structure, now
u32 confirm_offset = verify_u32(ptr - buf.get());
memcpy(ptr, p.first.get(), p.second);
ptr += p.second;
u32 idx = c * nBuckets + b;
memcpy(ptr, p.get(), p.size());
ptr += p.size();
confBase[idx] = confirm_offset;
}
return {move(buf), totalSize};
return buf;
}
} // namespace ue2
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,8 +40,8 @@
// the whole confirmation procedure
static really_inline
void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a,
size_t i, u32 pullBackAmount, hwlmcb_rv_t *control,
u32 *last_match, u64a conf_key) {
size_t i, hwlmcb_rv_t *control, u32 *last_match,
u64a conf_key) {
assert(i < a->len);
assert(ISALIGNED(fdrc));
@ -68,13 +68,10 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
goto out;
}
const u8 *loc = buf + i - li->size + 1 - pullBackAmount;
const u8 *loc = buf + i - li->size + 1;
u8 caseless = li->flags & Caseless;
if (loc < buf) {
u32 full_overhang = buf - loc;
const u8 *history = a->buf_history;
size_t len_history = a->len_history;
// can't do a vectored confirm either if we don't have
@ -82,44 +79,15 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
if (full_overhang > len_history) {
goto out;
}
// as for the regular case, no need to do a full confirm if
// we're a short literal
if (unlikely(li->size > sizeof(CONF_TYPE))) {
const u8 *s1 = (const u8 *)li + sizeof(*li);
const u8 *s2 = s1 + full_overhang;
const u8 *loc1 = history + len_history - full_overhang;
const u8 *loc2 = buf;
size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE));
size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang;
size_t size2 = wind_size2_back > li->size ?
0 : li->size - wind_size2_back;
if (cmpForward(loc1, s1, size1, caseless)) {
goto out;
}
if (cmpForward(loc2, s2, size2, caseless)) {
goto out;
}
}
} else { // NON-VECTORING PATH
// if string < conf_type we don't need regular string cmp
if (unlikely(li->size > sizeof(CONF_TYPE))) {
const u8 *s = (const u8 *)li + sizeof(*li);
if (cmpForward(loc, s, li->size - sizeof(CONF_TYPE),
caseless)) {
goto out;
}
}
}
assert(li->size <= sizeof(CONF_TYPE));
if (unlikely(!(li->groups & *control))) {
goto out;
}
if (unlikely(li->flags & ComplexConfirm)) {
const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount;
const u8 *loc2 = buf + i - li->extended_size + 1;
if (loc2 < buf) {
u32 full_overhang = buf - loc2;
size_t len_history = a->len_history;
@ -133,7 +101,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
*control = a->cb(loc - buf, i, li->id, a->ctxt);
out:
oldNext = li->next; // oldNext is either 0 or an 'adjust' value
li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size);
li++;
} while (oldNext);
}
@ -148,7 +116,7 @@ void confWithBit1(const struct FDRConfirm *fdrc,
assert(ISALIGNED(fdrc));
if (unlikely(fdrc->mult)) {
confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
confWithBit(fdrc, a, i, control, last_match, conf_key);
return;
} else {
u32 id = fdrc->nBitsOrSoleID;
@ -176,7 +144,7 @@ void confWithBitMany(const struct FDRConfirm *fdrc,
}
if (unlikely(fdrc->mult)) {
confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
confWithBit(fdrc, a, i, control, last_match, conf_key);
return;
} else {
const u32 id = fdrc->nBitsOrSoleID;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -44,8 +44,7 @@ namespace ue2 {
FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
def.numBuckets),
schemeWidth(def.schemeWidth), stride(0), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
@ -55,7 +54,7 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
}
void getFdrDescriptions(vector<FDREngineDescription> *out) {
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
static const FDREngineDef def = {0, 64, 8, 0};
out->clear();
out->emplace_back(def);
}
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -43,8 +43,6 @@ struct FDREngineDef {
u32 schemeWidth;
u32 numBuckets;
u64a cpu_features;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;
};
class FDREngineDescription : public EngineDescription {
@ -64,7 +62,6 @@ public:
explicit FDREngineDescription(const FDREngineDef &def);
u32 getDefaultFloodSuffixLength() const override;
bool typicallyHoldsOneCharLits() const override { return stride == 1; }
};
std::unique_ptr<FDREngineDescription>
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -71,11 +71,6 @@ struct FDR {
u32 maxStringLen;
u32 floodOffset;
/** link is the relative offset of a secondary included FDR table for
* stream handling if we're a primary FDR table or the subsidiary tertiary
* structures (spillover strings and hash table) if we're a secondary
* structure. */
u32 link;
u8 stride; /* stride - how frequently the data is consulted by the first
* stage matcher */
u8 domain; /* number of bits used to index into main FDR table. This value
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,6 +30,7 @@
#include "fdr_confirm.h"
#include "fdr_compile_internal.h"
#include "fdr_engine_description.h"
#include "grey.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bitutils.h"
@ -90,9 +91,9 @@ void addFlood(vector<FDRFlood> &tmpFlood, u8 c, const hwlmLiteral &lit,
}
}
pair<aligned_unique_ptr<u8>, size_t>
setupFDRFloodControl(const vector<hwlmLiteral> &lits,
const EngineDescription &eng) {
bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
const EngineDescription &eng,
const Grey &grey) {
vector<FDRFlood> tmpFlood(N_CHARS);
u32 default_suffix = eng.getDefaultFloodSuffixLength();
@ -187,6 +188,14 @@ setupFDRFloodControl(const vector<hwlmLiteral> &lits,
}
#endif
// If flood detection has been switched off in the grey box, we comply by
// setting idCount too high for all floods.
if (!grey.fdrAllowFlood) {
for (auto &fl : tmpFlood) {
fl.idCount = FDR_FLOOD_MAX_IDS;
}
}
map<FDRFlood, CharReach, FloodComparator> flood2chars;
for (u32 i = 0; i < N_CHARS; i++) {
FDRFlood fl = tmpFlood[i];
@ -198,7 +207,7 @@ setupFDRFloodControl(const vector<hwlmLiteral> &lits,
size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods;
size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize);
auto buf = aligned_zmalloc_unique<u8>(totalSize);
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
assert(buf); // otherwise would have thrown std::bad_alloc
u32 *floodHeader = (u32 *)buf.get();
@ -218,7 +227,7 @@ setupFDRFloodControl(const vector<hwlmLiteral> &lits,
DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n",
floodHeaderSize, floodStructSize, totalSize);
return {move(buf), totalSize};
return buf;
}
} // namespace ue2
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -100,7 +100,7 @@ const u8 * floodDetect(const struct FDR * fdr,
// tryFloodDetect is never put in places where unconditional reads
// a short distance forward or backward would be a problem
// TODO: rationale for this line needs to be rediscovered!!
size_t mainLoopLen = len > iterBytes ? len - iterBytes : 0;
size_t mainLoopLen = len > 2 * iterBytes ? len - 2 * iterBytes : 0;
const u32 i = ptr - buf;
u32 j = i;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -129,7 +129,8 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
return and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi));
return and128(pshufb_m128(maskBase[0 * 2], lo),
pshufb_m128(maskBase[0 * 2 + 1], hi));
}
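prep_conf_teddy_m1 is the core Teddy trick: each input byte is split into nibbles, each nibble indexes a 16-entry table of per-bucket bitmasks, and the two lookups are ANDed, so a byte can only survive for a bucket if both of its nibbles do. A scalar model of the same idea; in real Teddy the tables are built by the compiler from the literal set, while here they are hand-filled for one hypothetical byte.

// Scalar model of the Teddy nibble-shuffle step: a byte may belong to a
// bucket only if both its nibbles do. Mask contents are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t loMask[16] = {0}, hiMask[16] = {0};
    // pretend bucket 0 (bit 0) watches for the byte 0x61 ('a')
    loMask[0x1] |= 1; // low nibble of 'a'
    hiMask[0x6] |= 1; // high nibble of 'a'

    const char *in = "cab";
    for (const char *p = in; *p; p++) {
        uint8_t b = (uint8_t)*p;
        uint8_t hits = loMask[b & 0xf] & hiMask[b >> 4];
        printf("'%c' -> bucket bits %02x\n", *p, hits);
    }
    return 0;
}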
static really_inline
@ -139,8 +140,8 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val);
m128 res_1 = and128(pshufb(maskBase[1*2], lo),
pshufb(maskBase[1*2+1], hi));
m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo),
pshufb_m128(maskBase[1*2+1], hi));
m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
*old_1 = res_1;
return and128(r, res_shifted_1);
@ -154,8 +155,8 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
pshufb(maskBase[2*2+1], hi));
m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo),
pshufb_m128(maskBase[2*2+1], hi));
m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
*old_2 = res_2;
return and128(r, res_shifted_2);
@ -169,8 +170,8 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
pshufb(maskBase[3*2+1], hi));
m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo),
pshufb_m128(maskBase[3*2+1], hi));
m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
*old_3 = res_3;
return and128(r, res_shifted_3);
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -34,6 +34,7 @@
#define TEDDY_H_
#include "hwlm/hwlm.h" // for hwlm_group_t
#include "util/arch.h"
struct FDR; // forward declaration from fdr_internal.h
struct FDR_Runtime_Args;
@ -70,7 +71,7 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
#if defined(__AVX2__)
#if defined(HAVE_AVX2)
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
@ -104,15 +105,6 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t
fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
#endif /* __AVX2__ */
#endif /* HAVE_AVX2 */
#endif /* TEDDY_H_ */
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,78 +35,10 @@
#include "teddy.h"
#include "teddy_internal.h"
#include "teddy_runtime_common.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#if defined(__AVX2__)
static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
};
#if defined(HAVE_AVX2)
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
@ -199,22 +131,6 @@ do { \
} while (0);
#endif
#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn) \
do { \
if (unlikely(isnonzero256(var))) { \
u32 arrCnt = 0; \
m128 lo = cast256to128(var); \
m128 hi = movdq_hi(var); \
bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \
bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \
for (u32 i = 0; i < arrCnt; i++) { \
conf_fn(bitArr[i], confBase, reason, a, ptr, &control, \
&last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
} while (0);
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
@ -226,193 +142,13 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
return ret;
}
/*
* \brief Copy a block of [0,31] bytes efficiently.
*
* This function is a workaround intended to stop some compilers from
* synthesizing a memcpy function call out of the copy of a small number of
bytes that we do in vectoredLoad256.
*/
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
switch (len) {
case 0:
break;
case 1:
*dst = *src;
break;
case 2:
unaligned_store_u16(dst, unaligned_load_u16(src));
break;
case 3:
unaligned_store_u16(dst, unaligned_load_u16(src));
dst[2] = src[2];
break;
case 4:
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 5:
case 6:
case 7:
/* Perform copy with two overlapping 4-byte chunks. */
unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 8:
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
/* Perform copy with two overlapping 8-byte chunks. */
unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
case 16:
storeu128(dst, loadu128(src));
break;
default:
/* Perform copy with two overlapping 16-byte chunks. */
assert(len < 32);
storeu128(dst + len - 16, loadu128(src + len - 16));
storeu128(dst, loadu128(src));
break;
}
}
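The overlapping-chunk copies above are equivalent to memcpy for the lengths they handle. A quick standalone self-check of the 8-byte-chunk case, simplified from the switch above; the real code uses unaligned_load_u64a/unaligned_store_u64a, for which memcpy stands in here.

// Self-check that the overlapping 8-byte-chunk copy used for lengths
// 9..15 matches memcpy. Simplified from one case of the switch above.
#include <cassert>
#include <cstdint>
#include <cstring>

static void copyOverlap8(uint8_t *dst, const uint8_t *src, size_t len) {
    assert(len >= 8 && len <= 16);
    uint64_t a, b;
    memcpy(&a, src, 8);           // stands in for unaligned_load_u64a
    memcpy(&b, src + len - 8, 8);
    memcpy(dst, &a, 8);           // stands in for unaligned_store_u64a
    memcpy(dst + len - 8, &b, 8); // the two stores overlap for len < 16
}

int main() {
    uint8_t src[16], dst[16], ref[16];
    for (int i = 0; i < 16; i++) {
        src[i] = (uint8_t)(i * 17);
    }
    for (size_t len = 9; len <= 15; len++) {
        copyOverlap8(dst, src, len);
        memcpy(ref, src, len);
        assert(memcmp(dst, ref, len) == 0);
    }
    return 0;
}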
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history) {
union {
u8 val8[32];
m256 val256;
} u;
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) {
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 32) {
*p_mask = load256(p_mask_arr256[32] + 32);
return loadu256(ptr);
}
*p_mask = load256(p_mask_arr256[avail] + 32);
copy_start = 0;
copy_len = avail;
} else {
// need contains "how many chars to pull from history"
// calculate based on what we need, what we have in the buffer
// and only what we need to make primary confirm work
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
}
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
*p_mask = loadu256(p_mask_arr256[end - start] + 32 - start);
copy_start = i;
copy_len = end - i;
}
// Runt block from the buffer.
copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
return u.val256;
}
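The p_mask_arr256 rows removed above encode sliding byte-validity masks: row n is 32 zero bytes followed by n 0xff bytes (padded to 64), so a 32-byte load at offset 32 - start into row end - start is 0xff exactly over the valid window [start, end). A scalar model of that indexing; the row is rebuilt here rather than copied from the table.

// Scalar model of the p_mask_arr256 indexing used by vectoredLoad256.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    uint8_t row[64] = {0};
    unsigned start = 3, end = 20;        // valid window in a 32-byte block
    memset(row + 32, 0xff, end - start); // row (end - start) of the table

    uint8_t mask[32];
    memcpy(mask, row + 32 - start, 32);  // the "loadu256" of the real code

    for (unsigned i = 0; i < 32; i++) {
        putchar(mask[i] ? '1' : '0');    // 0s, then 1s over [start,end)
    }
    putchar('\n');
    return 0;
}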
static really_inline
void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase,
CautionReason reason,
const struct FDR_Runtime_Args *a,
const u8 *ptr, hwlmcb_rv_t *control,
u32 *last_match) {
u32 byte = bits / 8;
u32 cf = confBase[bits % 8];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, confVal);
}
static really_inline
void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase,
CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
u32 byte = bits / 8;
u32 bitRem = bits % 8;
u32 confSplit = *(ptr+byte) & 0x1f;
u32 idx = confSplit * 8 + bitRem;
u32 cf = confBase[idx];
if (!cf) {
return;
}
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
return;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit(fdrc, a, ptr - a->buf + byte, 0, control, last_match, confVal);
}
static really_inline
void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
if (unlikely(isnonzero128(var))) {
#ifdef ARCH_64_BIT
u64a part_0 = movq(var);
while (unlikely(part_0)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) +
64 * (offset);
*arrCnt += 1;
}
u64a part_1 = movq(rshiftbyte_m128(var, 8));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
64 * (offset + 1);
*arrCnt += 1;
}
#else
u32 part_0 = movd(var);
while (unlikely(part_0)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) +
32 * (offset * 2);
*arrCnt += 1;
}
u32 part_1 = movd(rshiftbyte_m128(var, 4));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
32 * (offset * 2 + 1);
*arrCnt += 1;
}
u32 part_2 = movd(rshiftbyte_m128(var, 8));
while (unlikely(part_2)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) +
32 * (offset * 2 + 2);
*arrCnt += 1;
}
u32 part_3 = movd(rshiftbyte_m128(var, 12));
while (unlikely(part_3)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) +
32 * (offset * 2 + 3);
*arrCnt += 1;
}
#endif
}
}
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(vpshufb(maskBase[0*2], lo),
vpshufb(maskBase[0*2+1], hi));
return and256(pshufb_m256(maskBase[0*2], lo),
pshufb_m256(maskBase[0*2+1], hi));
}
static really_inline
@ -422,8 +158,8 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
vpshufb(maskBase[1*2+1], hi));
m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
pshufb_m256(maskBase[1*2+1], hi));
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
*old_1 = res_1;
return and256(r, res_shifted_1);
@ -437,8 +173,8 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
vpshufb(maskBase[2*2+1], hi));
m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
pshufb_m256(maskBase[2*2+1], hi));
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2);
*old_2 = res_2;
return and256(r, res_shifted_2);
@ -452,20 +188,13 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
vpshufb(maskBase[3*2+1], hi));
m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
pshufb_m256(maskBase[3*2+1], hi));
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3);
*old_3 = res_3;
return and256(r, res_shifted_3);
}
static really_inline
m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) {
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi));
}
static really_inline
const m256 * getMaskBase_avx2(const struct Teddy *teddy) {
return (const m256 *)((const u8 *)teddy + sizeof(struct Teddy));
@ -959,136 +688,4 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 64;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 1);
const m256 maskLo = set2x128(maskBase[0]);
const m256 maskHi = set2x128(maskBase[1]);
const m256 mask = set32x8(0xf);
u16 bitArr[512];
const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 32;
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
ptr += 32;
}
if (ptr + 32 < buf_end) {
m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
ptr += 32;
}
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
m256 val_1 = load256(ptr + 32);
m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
}
for (; ptr < buf_end; ptr += 32) {
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
}
return HWLM_SUCCESS;
}
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 64;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m128 *maskBase = getMaskBase(teddy);
const u32 *confBase = getConfBase(teddy, 1);
const m256 maskLo = set2x128(maskBase[0]);
const m256 maskHi = set2x128(maskBase[1]);
const m256 mask = set32x8(0xf);
u16 bitArr[512];
const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 32;
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
ptr += 32;
}
if (ptr + 32 < buf_end) {
m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
ptr += 32;
}
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
m256 val_1 = load256(ptr + 32);
m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi);
CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
}
for (; ptr < buf_end; ptr += 32) {
m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
}
return HWLM_SUCCESS;
}
#endif // __AVX2__
#endif // HAVE_AVX2
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,22 +26,29 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file
* \brief FDR literal matcher: Teddy build code.
*/
#include "teddy_compile.h"
#include "fdr.h"
#include "fdr_internal.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"
#include "fdr_engine_description.h"
#include "teddy_internal.h"
#include "teddy_engine_description.h"
#include "grey.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/noncopyable.h"
#include "util/popcount.h"
#include "util/target_info.h"
#include "util/verify_types.h"
#include "teddy_compile.h"
#include "teddy_internal.h"
#include "teddy_engine_description.h"
#include <algorithm>
#include <cassert>
#include <cctype>
@ -54,8 +61,6 @@
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
using namespace std;
namespace ue2 {
@ -64,17 +69,20 @@ namespace {
//#define TEDDY_DEBUG
class TeddyCompiler : boost::noncopyable {
class TeddyCompiler : noncopyable {
const TeddyEngineDescription &eng;
const Grey &grey;
const vector<hwlmLiteral> &lits;
bool make_small;
public:
TeddyCompiler(const vector<hwlmLiteral> &lits_in,
const TeddyEngineDescription &eng_in, bool make_small_in)
: eng(eng_in), lits(lits_in), make_small(make_small_in) {}
const TeddyEngineDescription &eng_in, bool make_small_in,
const Grey &grey_in)
: eng(eng_in), grey(grey_in), lits(lits_in), make_small(make_small_in) {
}
aligned_unique_ptr<FDR> build(pair<aligned_unique_ptr<u8>, size_t> &link);
bytecode_ptr<FDR> build();
bool pack(map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits);
};
@ -274,8 +282,7 @@ bool TeddyCompiler::pack(map<BucketIndex,
return true;
}
aligned_unique_ptr<FDR>
TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
bytecode_ptr<FDR> TeddyCompiler::build() {
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
DEBUG_PRINTF("too many literals: %zu\n", lits.size());
return nullptr;
@ -308,16 +315,16 @@ TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
auto floodControlTmp = setupFDRFloodControl(lits, eng);
auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small);
auto floodControlTmp = setupFDRFloodControl(lits, eng, grey);
auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small);
size_t size = ROUNDUP_N(sizeof(Teddy) +
maskLen +
confirmTmp.second +
floodControlTmp.second +
link.second, 16 * maskWidth);
maskLen +
confirmTmp.size() +
floodControlTmp.size(),
16 * maskWidth);
aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
assert(fdr); // otherwise would have thrown std::bad_alloc
Teddy *teddy = (Teddy *)fdr.get(); // ugly
u8 *teddy_base = (u8 *)teddy;
@ -327,19 +334,12 @@ TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
teddy->maxStringLen = verify_u32(maxLen(lits));
u8 *ptr = teddy_base + sizeof(Teddy) + maskLen;
memcpy(ptr, confirmTmp.first.get(), confirmTmp.second);
ptr += confirmTmp.second;
memcpy(ptr, confirmTmp.get(), confirmTmp.size());
ptr += confirmTmp.size();
teddy->floodOffset = verify_u32(ptr - teddy_base);
memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second);
ptr += floodControlTmp.second;
if (link.first) {
teddy->link = verify_u32(ptr - teddy_base);
memcpy(ptr, link.first.get(), link.second);
} else {
teddy->link = 0;
}
memcpy(ptr, floodControlTmp.get(), floodControlTmp.size());
ptr += floodControlTmp.size();
u8 *baseMsk = teddy_base + sizeof(Teddy);
@ -423,10 +423,10 @@ TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
} // namespace
aligned_unique_ptr<FDR>
teddyBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small,
u32 hint, const target_t &target,
pair<aligned_unique_ptr<u8>, size_t> &link) {
bytecode_ptr<FDR> teddyBuildTableHinted(const vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey) {
unique_ptr<TeddyEngineDescription> des;
if (hint == HINT_INVALID) {
des = chooseTeddyEngine(target, lits);
@ -436,8 +436,8 @@ teddyBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small,
if (!des) {
return nullptr;
}
TeddyCompiler tc(lits, *des, make_small);
return tc.build(link);
TeddyCompiler tc(lits, *des, make_small, grey);
return tc.build();
}
} // namespace ue2
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,7 +26,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief FDR literal matcher: Teddy build API.
*/
@ -34,22 +35,22 @@
#define TEDDY_COMPILE_H
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include <vector>
#include <utility> // std::pair
struct FDR;
struct target_t;
namespace ue2 {
struct Grey;
struct hwlmLiteral;
struct target_t;
ue2::aligned_unique_ptr<FDR>
teddyBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
u32 hint, const target_t &target,
std::pair<aligned_unique_ptr<u8>, size_t> &link);
bytecode_ptr<FDR> teddyBuildTableHinted(const std::vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey);
} // namespace ue2
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -44,8 +44,7 @@ namespace ue2 {
TeddyEngineDescription::TeddyEngineDescription(const TeddyEngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
def.numBuckets),
numMasks(def.numMasks), packed(def.packed) {}
u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
@ -66,24 +65,22 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 },
{ 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 },
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 },
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 },
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 },
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 },
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 },
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 },
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 },
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 },
{ 11, 0, 1, 8, false, 0, 1 },
{ 12, 0, 1, 8, true, 0, 32 },
{ 13, 0, 2, 8, false, 0, 1 },
{ 14, 0, 2, 8, true, 0, 32 },
{ 15, 0, 3, 8, false, 0, 1 },
{ 16, 0, 3, 8, true, 0, 32 },
{ 17, 0, 4, 8, false, 0, 1 },
{ 18, 0, 4, 8, true, 0, 32 },
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
{ 11, 0, 1, 8, false },
{ 12, 0, 1, 8, true },
{ 13, 0, 2, 8, false },
{ 14, 0, 2, 8, true },
{ 15, 0, 3, 8, false },
{ 16, 0, 3, 8, true },
{ 17, 0, 4, 8, false },
{ 18, 0, 4, 8, true },
};
out->clear();
for (const auto &def : defns) {
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -45,8 +45,6 @@ struct TeddyEngineDef {
u32 numMasks;
u32 numBuckets;
bool packed;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;
};
class TeddyEngineDescription : public EngineDescription {
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -180,9 +180,7 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 bitRem = bit % bucket;
u32 confSplit = *(ptr+byte) & 0x1f;
u32 idx = confSplit * bucket + bitRem;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
if (!cf) {
continue;
@ -193,7 +191,7 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
confWithBit(fdrc, a, ptr - a->buf + byte, control,
last_match, confVal);
} while (unlikely(*conf));
}
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -42,6 +42,7 @@ namespace ue2 {
Grey::Grey(void) :
optimiseComponentTree(true),
calcComponents(true),
performGraphSimplification(true),
prefilterReductions(true),
removeEdgeRedundancy(true),
@ -54,7 +55,6 @@ Grey::Grey(void) :
allowMcSheng(true),
allowPuff(true),
allowLiteral(true),
allowRose(true),
allowViolet(true),
allowExtendedNFA(true), /* bounded repeats of course */
allowLimExNFA(true),
@ -62,8 +62,10 @@ Grey::Grey(void) :
allowSmallLiteralSet(true),
allowCastle(true),
allowDecoratedLiteral(true),
allowApproximateMatching(true),
allowNoodle(true),
fdrAllowTeddy(true),
fdrAllowFlood(true),
violetAvoidSuffixes(true),
violetAvoidWeakInfixes(true),
violetDoubleCut(true),
@ -98,6 +100,7 @@ Grey::Grey(void) :
minRoseLiteralLength(3),
minRoseNetflowLiteralLength(2),
maxRoseNetflowEdges(50000), /* otherwise no netflow pass. */
maxEditDistance(16),
minExtBoundedRepeatSize(32),
goughCopyPropagate(true),
goughRegisterAllocate(true),
@ -105,8 +108,6 @@ Grey::Grey(void) :
roseGraphReduction(true),
roseRoleAliasing(true),
roseMasks(true),
roseMaxBadLeafLength(5),
roseConvertInfBadLeaves(true),
roseConvertFloodProneSuffixes(true),
roseMergeRosesDuringAliasing(true),
roseMultiTopRoses(true),
@ -116,7 +117,6 @@ Grey::Grey(void) :
roseMcClellanSuffix(1),
roseMcClellanOutfix(2),
roseTransformDelay(true),
roseDesiredSplit(4),
earlyMcClellanPrefix(true),
earlyMcClellanInfix(true),
earlyMcClellanSuffix(true),
@ -157,7 +157,8 @@ Grey::Grey(void) :
limitEngineSize(1073741824), // 1 GB
limitDFASize(1073741824), // 1 GB
limitNFASize(1048576), // 1 MB
limitLBRSize(1048576) // 1 MB
limitLBRSize(1048576), // 1 MB
limitApproxMatchingVertices(5000)
{
assert(maxAnchoredRegion < 64); /* a[lm]_log_sum have limited capacity */
}
@ -209,6 +210,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
} while (0)
G_UPDATE(optimiseComponentTree);
G_UPDATE(calcComponents);
G_UPDATE(performGraphSimplification);
G_UPDATE(prefilterReductions);
G_UPDATE(removeEdgeRedundancy);
@ -221,7 +223,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(allowMcSheng);
G_UPDATE(allowPuff);
G_UPDATE(allowLiteral);
G_UPDATE(allowRose);
G_UPDATE(allowViolet);
G_UPDATE(allowExtendedNFA);
G_UPDATE(allowLimExNFA);
@ -230,7 +231,9 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(allowCastle);
G_UPDATE(allowDecoratedLiteral);
G_UPDATE(allowNoodle);
G_UPDATE(allowApproximateMatching);
G_UPDATE(fdrAllowTeddy);
G_UPDATE(fdrAllowFlood);
G_UPDATE(violetAvoidSuffixes);
G_UPDATE(violetAvoidWeakInfixes);
G_UPDATE(violetDoubleCut);
@ -265,6 +268,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(minRoseLiteralLength);
G_UPDATE(minRoseNetflowLiteralLength);
G_UPDATE(maxRoseNetflowEdges);
G_UPDATE(maxEditDistance);
G_UPDATE(minExtBoundedRepeatSize);
G_UPDATE(goughCopyPropagate);
G_UPDATE(goughRegisterAllocate);
@ -272,8 +276,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(roseGraphReduction);
G_UPDATE(roseRoleAliasing);
G_UPDATE(roseMasks);
G_UPDATE(roseMaxBadLeafLength);
G_UPDATE(roseConvertInfBadLeaves);
G_UPDATE(roseConvertFloodProneSuffixes);
G_UPDATE(roseMergeRosesDuringAliasing);
G_UPDATE(roseMultiTopRoses);
@ -283,7 +285,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(roseMcClellanSuffix);
G_UPDATE(roseMcClellanOutfix);
G_UPDATE(roseTransformDelay);
G_UPDATE(roseDesiredSplit);
G_UPDATE(earlyMcClellanPrefix);
G_UPDATE(earlyMcClellanInfix);
G_UPDATE(earlyMcClellanSuffix);
@ -319,6 +320,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(limitDFASize);
G_UPDATE(limitNFASize);
G_UPDATE(limitLBRSize);
G_UPDATE(limitApproxMatchingVertices);
#undef G_UPDATE
if (key == "simple_som") {
@ -340,7 +342,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
g->allowMcClellan = false;
g->allowPuff = false;
g->allowLiteral = false;
g->allowRose = false;
g->allowViolet = false;
g->allowSmallLiteralSet = false;
g->roseMasks = false;
@ -358,7 +359,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
g->allowMcClellan = true;
g->allowPuff = false;
g->allowLiteral = false;
g->allowRose = false;
g->allowViolet = false;
g->allowSmallLiteralSet = false;
g->roseMasks = false;
@ -376,7 +376,6 @@ void applyGreyOverrides(Grey *g, const string &s) {
g->allowMcClellan = true;
g->allowPuff = false;
g->allowLiteral = false;
g->allowRose = false;
g->allowViolet = false;
g->allowSmallLiteralSet = false;
g->roseMasks = false;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,6 +41,7 @@ struct Grey {
bool optimiseComponentTree;
bool calcComponents;
bool performGraphSimplification;
bool prefilterReductions;
bool removeEdgeRedundancy;
@ -54,7 +55,6 @@ struct Grey {
bool allowMcSheng;
bool allowPuff;
bool allowLiteral;
bool allowRose;
bool allowViolet;
bool allowExtendedNFA;
bool allowLimExNFA;
@ -62,9 +62,11 @@ struct Grey {
bool allowSmallLiteralSet;
bool allowCastle;
bool allowDecoratedLiteral;
bool allowApproximateMatching;
bool allowNoodle;
bool fdrAllowTeddy;
bool fdrAllowFlood;
u32 violetAvoidSuffixes; /* 0=never, 1=sometimes, 2=always */
bool violetAvoidWeakInfixes;
@ -107,6 +109,7 @@ struct Grey {
u32 minRoseLiteralLength;
u32 minRoseNetflowLiteralLength;
u32 maxRoseNetflowEdges;
u32 maxEditDistance;
u32 minExtBoundedRepeatSize; /* to be considered for ng_repeat */
@ -118,8 +121,6 @@ struct Grey {
bool roseGraphReduction;
bool roseRoleAliasing;
bool roseMasks;
u32 roseMaxBadLeafLength;
bool roseConvertInfBadLeaves;
bool roseConvertFloodProneSuffixes;
bool roseMergeRosesDuringAliasing;
bool roseMultiTopRoses;
@ -130,7 +131,6 @@ struct Grey {
* always */
u32 roseMcClellanOutfix; /* 0 = off, 1 = sometimes, 2 = almost always */
bool roseTransformDelay;
u32 roseDesiredSplit;
bool earlyMcClellanPrefix;
bool earlyMcClellanInfix;
@ -202,6 +202,9 @@ struct Grey {
u32 limitDFASize; //!< max size of a DFA (in bytes)
u32 limitNFASize; //!< max size of an NFA (in bytes)
u32 limitLBRSize; //!< max size of an LBR engine (in bytes)
// Approximate matching limits.
u32 limitApproxMatchingVertices; //!< max number of vertices per graph
};
#ifndef RELEASE_BUILD
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -39,10 +39,10 @@
#include "compiler/error.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_expr_info.h"
#include "nfagraph/ng_extparam.h"
#include "parser/parse_error.h"
#include "parser/Parser.h"
#include "parser/parse_error.h"
#include "parser/prefilter.h"
#include "parser/unsupported.h"
#include "util/compile_error.h"
#include "util/cpuid_flags.h"
#include "util/depth.h"
@ -119,8 +119,9 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) {
static
bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) {
#define HS_TUNE_LAST HS_TUNE_FAMILY_BDW
#define HS_CPU_FEATURES_ALL (HS_CPU_FEATURES_AVX2)
static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM;
static constexpr u32 HS_CPU_FEATURES_ALL =
HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512;
if (!p) {
return true;
@ -277,9 +278,10 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
} // namespace ue2
extern "C" HS_PUBLIC_API
hs_error_t hs_compile(const char *expression, unsigned flags, unsigned mode,
const hs_platform_info_t *platform, hs_database_t **db,
hs_compile_error_t **error) {
hs_error_t HS_CDECL hs_compile(const char *expression, unsigned flags,
unsigned mode,
const hs_platform_info_t *platform,
hs_database_t **db, hs_compile_error_t **error) {
if (expression == nullptr) {
*db = nullptr;
*error = generateCompileError("Invalid parameter: expression is NULL",
@ -295,24 +297,25 @@ hs_error_t hs_compile(const char *expression, unsigned flags, unsigned mode,
}
extern "C" HS_PUBLIC_API
hs_error_t hs_compile_multi(const char * const *expressions,
const unsigned *flags, const unsigned *ids,
unsigned elements, unsigned mode,
const hs_platform_info_t *platform,
hs_database_t **db, hs_compile_error_t **error) {
hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions,
const unsigned *flags, const unsigned *ids,
unsigned elements, unsigned mode,
const hs_platform_info_t *platform,
hs_database_t **db,
hs_compile_error_t **error) {
const hs_expr_ext * const *ext = nullptr; // unused for this call.
return hs_compile_multi_int(expressions, flags, ids, ext, elements, mode,
platform, db, error, Grey());
}
extern "C" HS_PUBLIC_API
hs_error_t hs_compile_ext_multi(const char * const *expressions,
const unsigned *flags, const unsigned *ids,
const hs_expr_ext * const *ext,
unsigned elements, unsigned mode,
const hs_platform_info_t *platform,
hs_database_t **db,
hs_compile_error_t **error) {
hs_error_t HS_CDECL hs_compile_ext_multi(const char * const *expressions,
const unsigned *flags, const unsigned *ids,
const hs_expr_ext * const *ext,
unsigned elements, unsigned mode,
const hs_platform_info_t *platform,
hs_database_t **db,
hs_compile_error_t **error) {
return hs_compile_multi_int(expressions, flags, ids, ext, elements, mode,
platform, db, error, Grey());
}
@ -368,19 +371,28 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
assert(pe.component);
// Apply prefiltering transformations if desired.
if (pe.prefilter) {
if (pe.expr.prefilter) {
prefilterTree(pe.component, ParseMode(flags));
}
unique_ptr<NGWrapper> g = buildWrapper(rm, cc, pe);
// Expressions containing zero-width assertions and other extended pcre
// types aren't supported yet. This call will throw a ParseError
// exception if the component tree contains such a construct.
checkUnsupported(*pe.component);
pe.component->checkEmbeddedStartAnchor(true);
pe.component->checkEmbeddedEndAnchor(true);
auto built_expr = buildGraph(rm, cc, pe);
unique_ptr<NGHolder> &g = built_expr.g;
ExpressionInfo &expr = built_expr.expr;
if (!g) {
DEBUG_PRINTF("NFA build failed, but no exception was thrown.\n");
throw ParseError("Internal error.");
}
handleExtendedParams(rm, *g, cc);
fillExpressionInfo(rm, *g, &local_info);
fillExpressionInfo(rm, cc, *g, expr, &local_info);
}
catch (const CompileError &e) {
// Compiler error occurred
@ -409,24 +421,26 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
}
extern "C" HS_PUBLIC_API
hs_error_t hs_expression_info(const char *expression, unsigned int flags,
hs_expr_info_t **info,
hs_compile_error_t **error) {
hs_error_t HS_CDECL hs_expression_info(const char *expression,
unsigned int flags,
hs_expr_info_t **info,
hs_compile_error_t **error) {
return hs_expression_info_int(expression, flags, nullptr, HS_MODE_BLOCK,
info, error);
}
extern "C" HS_PUBLIC_API
hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error) {
hs_error_t HS_CDECL hs_expression_ext_info(const char *expression,
unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error) {
return hs_expression_info_int(expression, flags, ext, HS_MODE_BLOCK, info,
error);
}
extern "C" HS_PUBLIC_API
hs_error_t hs_populate_platform(hs_platform_info_t *platform) {
hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform) {
if (!platform) {
return HS_INVALID;
}
@ -440,7 +454,7 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform) {
}
extern "C" HS_PUBLIC_API
hs_error_t hs_free_compile_error(hs_compile_error_t *error) {
hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) {
#if defined(FAT_RUNTIME)
if (!check_ssse3()) {
return HS_ARCH_ERROR;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,6 +29,11 @@
#ifndef HS_COMMON_H_
#define HS_COMMON_H_
#if defined(_WIN32)
#define HS_CDECL __cdecl
#else
#define HS_CDECL
#endif
#include <stdlib.h>
/**
@ -76,7 +81,7 @@ typedef int hs_error_t;
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_free_database(hs_database_t *db);
hs_error_t HS_CDECL hs_free_database(hs_database_t *db);
/**
* Serialize a pattern database to a stream of bytes.
@ -100,8 +105,8 @@ hs_error_t hs_free_database(hs_database_t *db);
* @ref HS_SUCCESS on success, @ref HS_NOMEM if the byte array cannot be
* allocated, other values may be returned if errors are detected.
*/
hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
size_t *length);
hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes,
size_t *length);
/**
* Reconstruct a pattern database from a stream of bytes previously generated
@ -129,8 +134,9 @@ hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
hs_database_t **db);
hs_error_t HS_CDECL hs_deserialize_database(const char *bytes,
const size_t length,
hs_database_t **db);
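As an editorial sketch of the round trip these two calls provide (not part of this commit; `db` is assumed to be a previously compiled database, and error handling is elided):

```c
char *bytes = NULL;
size_t length = 0;
if (hs_serialize_database(db, &bytes, &length) == HS_SUCCESS) {
    hs_database_t *copy = NULL;
    if (hs_deserialize_database(bytes, length, &copy) == HS_SUCCESS) {
        /* copy behaves identically to db on a compatible platform */
        hs_free_database(copy);
    }
    free(bytes); /* the serialized image is caller-freed */
}
```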
/**
* Reconstruct a pattern database from a stream of bytes previously generated
@ -160,8 +166,9 @@ hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
hs_database_t *db);
hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes,
const size_t length,
hs_database_t *db);
/**
* Provides the size of the stream state allocated by a single stream opened
@ -177,7 +184,8 @@ hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_stream_size(const hs_database_t *database, size_t *stream_size);
hs_error_t HS_CDECL hs_stream_size(const hs_database_t *database,
size_t *stream_size);
/**
* Provides the size of the given database in bytes.
@ -192,8 +200,8 @@ hs_error_t hs_stream_size(const hs_database_t *database, size_t *stream_size);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_database_size(const hs_database_t *database,
size_t *database_size);
hs_error_t HS_CDECL hs_database_size(const hs_database_t *database,
size_t *database_size);
/**
* Utility function for reporting the size that would be required by a
@ -219,8 +227,9 @@ hs_error_t hs_database_size(const hs_database_t *database,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_serialized_database_size(const char *bytes, const size_t length,
size_t *deserialized_size);
hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes,
const size_t length,
size_t *deserialized_size);
/**
* Utility function providing information about a database.
@ -237,7 +246,8 @@ hs_error_t hs_serialized_database_size(const char *bytes, const size_t length,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_database_info(const hs_database_t *database, char **info);
hs_error_t HS_CDECL hs_database_info(const hs_database_t *database,
char **info);
/**
* Utility function providing information about a serialized database.
@ -258,8 +268,8 @@ hs_error_t hs_database_info(const hs_database_t *database, char **info);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
char **info);
hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes,
size_t length, char **info);
/**
* The type of the callback function that will be used by Hyperscan to allocate
@ -275,7 +285,7 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
* @return
* A pointer to the region of memory allocated, or NULL on error.
*/
typedef void *(*hs_alloc_t)(size_t size);
typedef void *(HS_CDECL *hs_alloc_t)(size_t size);
/**
* The type of the callback function that will be used by Hyperscan to free
@ -284,7 +294,7 @@ typedef void *(*hs_alloc_t)(size_t size);
* @param ptr
* The region of memory to be freed.
*/
typedef void (*hs_free_t)(void *ptr);
typedef void (HS_CDECL *hs_free_t)(void *ptr);
/**
* Set the allocate and free functions used by Hyperscan for allocating
@ -312,7 +322,8 @@ typedef void (*hs_free_t)(void *ptr);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_set_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
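A sketch of allocator callbacks satisfying the declarations above (editorial, not from the commit); with this change the callbacks carry the `HS_CDECL` convention, which only has an effect on Windows builds:

```c
#include <stdlib.h>
#include <hs.h>

static void *HS_CDECL my_alloc(size_t size) {
    return malloc(size);
}

static void HS_CDECL my_free(void *ptr) {
    free(ptr);
}

/* installed once at start-up: hs_set_allocator(my_alloc, my_free); */
```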
/**
* Set the allocate and free functions used by Hyperscan for allocating memory
@ -344,8 +355,8 @@ hs_error_t hs_set_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_set_database_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
/**
* Set the allocate and free functions used by Hyperscan for allocating memory
@ -371,7 +382,8 @@ hs_error_t hs_set_database_allocator(hs_alloc_t alloc_func,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_set_misc_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
/**
* Set the allocate and free functions used by Hyperscan for allocating memory
@ -397,7 +409,8 @@ hs_error_t hs_set_misc_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_set_scratch_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
/**
* Set the allocate and free functions used by Hyperscan for allocating memory
@ -423,7 +436,8 @@ hs_error_t hs_set_scratch_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t alloc_func,
hs_free_t free_func);
/**
* Utility function for identifying this release version.
@ -433,7 +447,7 @@ hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
* date of the build. It is allocated statically, so it does not need to
* be freed by the caller.
*/
const char *hs_version(void);
const char * HS_CDECL hs_version(void);
/**
* Utility function to test the current system architecture.
@ -450,7 +464,7 @@ const char *hs_version(void);
* @ref HS_SUCCESS on success, @ref HS_ARCH_ERROR if system does not
* support Hyperscan.
*/
hs_error_t hs_valid_platform(void);
hs_error_t HS_CDECL hs_valid_platform(void);
/**
* @defgroup HS_ERROR hs_error_t values
@ -545,7 +559,7 @@ hs_error_t hs_valid_platform(void);
* At a minimum, Hyperscan requires Supplemental Streaming SIMD Extensions 3
* (SSSE3).
*/
#define HS_ARCH_ERROR (-11)
#define HS_ARCH_ERROR (-11)
/** @} */
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -169,13 +169,23 @@ typedef struct hs_platform_info {
typedef struct hs_expr_info {
/**
* The minimum length in bytes of a match for the pattern.
*
* Note: in some cases when using advanced features to suppress matches
* (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this
* may represent a conservative lower bound for the true minimum length of
* a match.
*/
unsigned int min_width;
/**
* The maximum length in bytes of a match for the pattern. If the pattern
* has an unbounded maximum width, this will be set to the maximum value of
* an unsigned int (UINT_MAX).
* has an unbounded maximum length, this will be set to the maximum value
* of an unsigned int (UINT_MAX).
*
* Note: in some cases when using advanced features to suppress matches
* (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this
* may represent a conservative upper bound for the true maximum length of
* a match.
*/
unsigned int max_width;
@ -241,6 +251,13 @@ typedef struct hs_expr_ext {
* @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field.
*/
unsigned long long min_length;
/**
* Allow patterns to approximately match within this edit distance. To use
* this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the
* hs_expr_ext::flags field.
*/
unsigned edit_distance;
} hs_expr_ext_t;
/**
@ -261,6 +278,9 @@ typedef struct hs_expr_ext {
/** Flag indicating that the hs_expr_ext::min_length field is used. */
#define HS_EXT_FLAG_MIN_LENGTH 4ULL
/** Flag indicating that the hs_expr_ext::edit_distance field is used. */
#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL
/** @} */
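To illustrate the new parameter (an editorial sketch, not part of the commit; the pattern and id are arbitrary), a caller sets `HS_EXT_FLAG_EDIT_DISTANCE` and `edit_distance` before compiling:

```c
#include <string.h>
#include <hs.h>

static hs_database_t *compile_approx(void) {
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
    ext.edit_distance = 1; /* also report matches within edit distance 1 */

    const char *exprs[] = { "hyperscan" };
    const unsigned int flags[] = { HS_FLAG_SINGLEMATCH };
    const unsigned int ids[] = { 1 };
    const hs_expr_ext_t *exts[] = { &ext };

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    if (hs_compile_ext_multi(exprs, flags, ids, exts, 1, HS_MODE_BLOCK,
                             NULL, &db, &err) != HS_SUCCESS) {
        hs_free_compile_error(err);
        return NULL;
    }
    return db;
}
```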
/**
@ -323,9 +343,10 @@ typedef struct hs_expr_ext {
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t hs_compile(const char *expression, unsigned int flags,
unsigned int mode, const hs_platform_info_t *platform,
hs_database_t **db, hs_compile_error_t **error);
hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags,
unsigned int mode,
const hs_platform_info_t *platform,
hs_database_t **db, hs_compile_error_t **error);
/**
* The multiple regular expression compiler.
@ -401,11 +422,13 @@ hs_error_t hs_compile(const char *expression, unsigned int flags,
* parameter.
*
*/
hs_error_t hs_compile_multi(const char *const *expressions,
const unsigned int *flags, const unsigned int *ids,
unsigned int elements, unsigned int mode,
const hs_platform_info_t *platform,
hs_database_t **db, hs_compile_error_t **error);
hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements, unsigned int mode,
const hs_platform_info_t *platform,
hs_database_t **db,
hs_compile_error_t **error);
/**
* The multiple regular expression compiler with extended parameter support.
@ -486,7 +509,7 @@ hs_error_t hs_compile_multi(const char *const *expressions,
* parameter.
*
*/
hs_error_t hs_compile_ext_multi(const char *const *expressions,
hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
const hs_expr_ext_t *const *ext,
@ -505,13 +528,24 @@ hs_error_t hs_compile_ext_multi(const char *const *expressions,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_free_compile_error(hs_compile_error_t *error);
hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
/**
* Utility function providing information about a regular expression. The
* information provided in @ref hs_expr_info_t includes the minimum and maximum
* width of a pattern match.
*
* Note: successful analysis of an expression with this function does not imply
* that compilation of the same expression (via @ref hs_compile(), @ref
* hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
* function may return @ref HS_SUCCESS for regular expressions that Hyperscan
* cannot compile.
*
* Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
* HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
* the properties returned in the @ref hs_expr_info_t structure, they will not
* affect the outcome of this function.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
@ -553,15 +587,27 @@ hs_error_t hs_free_compile_error(hs_compile_error_t *error);
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t hs_expression_info(const char *expression, unsigned int flags,
hs_expr_info_t **info,
hs_compile_error_t **error);
hs_error_t HS_CDECL hs_expression_info(const char *expression,
unsigned int flags,
hs_expr_info_t **info,
hs_compile_error_t **error);
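A usage sketch for this call (editorial, not part of the commit; the expression is arbitrary):

```c
#include <stdio.h>
#include <stdlib.h>
#include <hs.h>

static void show_widths(void) {
    hs_expr_info_t *info = NULL;
    hs_compile_error_t *err = NULL;
    if (hs_expression_info("foo[0-9]+", 0, &info, &err) == HS_SUCCESS) {
        printf("min width %u, max width %u\n", info->min_width,
               info->max_width);
        free(info); /* the info block is caller-freed */
    } else {
        hs_free_compile_error(err);
    }
}
```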
/**
* Utility function providing information about a regular expression, with
* extended parameter support. The information provided in @ref hs_expr_info_t
* includes the minimum and maximum width of a pattern match.
*
* Note: successful analysis of an expression with this function does not imply
* that compilation of the same expression (via @ref hs_compile(), @ref
* hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
* function may return @ref HS_SUCCESS for regular expressions that Hyperscan
* cannot compile.
*
* Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
* HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
* the properties returned in the @ref hs_expr_info_t structure, they will not
* affect the outcome of this function.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
@ -608,10 +654,11 @@ hs_error_t hs_expression_info(const char *expression, unsigned int flags,
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error);
hs_error_t HS_CDECL hs_expression_ext_info(const char *expression,
unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error);
/**
* Populates the platform information based on the current host.
@ -623,7 +670,7 @@ hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_populate_platform(hs_platform_info_t *platform);
hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
/**
* @defgroup HS_PATTERN_FLAG Pattern flags
@ -770,6 +817,14 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform);
*/
#define HS_CPU_FEATURES_AVX2 (1ULL << 2)
/**
* CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512)
*
* Setting this flag indicates that the target platform supports AVX512
* instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2.
*/
#define HS_CPU_FEATURES_AVX512 (1ULL << 3)
/** @} */
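An editorial sketch of requesting these features explicitly; in practice `hs_populate_platform()` fills this structure from the host, and the tune value below (`HS_TUNE_FAMILY_SKX`, defined later in this file) is purely illustrative:

```c
#include <string.h>
#include <hs.h>

static void fill_avx512_platform(hs_platform_info_t *plat) {
    memset(plat, 0, sizeof(*plat));
    plat->tune = HS_TUNE_FAMILY_SKX;  /* illustrative tune choice */
    plat->cpu_features = HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512;
}
/* the result is passed as the platform argument to the hs_compile*() calls */
```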
/**
@ -826,6 +881,30 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform);
*/
#define HS_TUNE_FAMILY_BDW 5
/**
* Tuning Parameter - Intel(R) microarchitecture code name Skylake
*
* This indicates that the compiled database should be tuned for the
* Skylake microarchitecture.
*/
#define HS_TUNE_FAMILY_SKL 6
/**
* Tuning Parameter - Intel(R) microarchitecture code name Skylake Server
*
* This indicates that the compiled database should be tuned for the
* Skylake Server microarchitecture.
*/
#define HS_TUNE_FAMILY_SKX 7
/**
* Tuning Parameter - Intel(R) microarchitecture code name Goldmont
*
* This indicates that the compiled database should be tuned for the
* Goldmont microarchitecture.
*/
#define HS_TUNE_FAMILY_GLM 8
/** @} */
/**
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -145,8 +145,8 @@ typedef int (*match_event_handler)(unsigned int id,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_open_stream(const hs_database_t *db, unsigned int flags,
hs_stream_t **stream);
hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, unsigned int flags,
hs_stream_t **stream);
/**
* Write data to be scanned to the opened stream.
@ -185,10 +185,10 @@ hs_error_t hs_open_stream(const hs_database_t *db, unsigned int flags,
* match callback indicated that scanning should stop; other values on
* error.
*/
hs_error_t hs_scan_stream(hs_stream_t *id, const char *data,
unsigned int length, unsigned int flags,
hs_scratch_t *scratch, match_event_handler onEvent,
void *ctxt);
hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data,
unsigned int length, unsigned int flags,
hs_scratch_t *scratch,
match_event_handler onEvent, void *ctxt);
/**
* Close a stream.
@ -223,8 +223,8 @@ hs_error_t hs_scan_stream(hs_stream_t *id, const char *data,
* @return
* Returns @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
match_event_handler onEvent, void *ctxt);
hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
match_event_handler onEvent, void *ctxt);
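The streaming calls above compose as follows (editorial sketch; `db`, `scratch`, `on_match`, and the data chunks are assumed to exist, and error handling is elided):

```c
hs_stream_t *stream = NULL;
hs_open_stream(db, 0, &stream);
hs_scan_stream(stream, chunk1, len1, 0, scratch, on_match, NULL);
hs_scan_stream(stream, chunk2, len2, 0, scratch, on_match, NULL);
/* matches ending at end-of-data are delivered during close: */
hs_close_stream(stream, scratch, on_match, NULL);
```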
/**
* Reset a stream to an initial state.
@ -264,9 +264,9 @@ hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_reset_stream(hs_stream_t *id, unsigned int flags,
hs_scratch_t *scratch, match_event_handler onEvent,
void *context);
hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, unsigned int flags,
hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
/**
* Duplicate the given stream. The new stream will have the same state as the
@ -282,7 +282,8 @@ hs_error_t hs_reset_stream(hs_stream_t *id, unsigned int flags,
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id);
hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id,
const hs_stream_t *from_id);
/**
* Duplicate the given 'from' stream state onto the 'to' stream. The 'to' stream
@ -314,11 +315,11 @@ hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id,
const hs_stream_t *from_id,
hs_scratch_t *scratch,
match_event_handler onEvent,
void *context);
hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id,
const hs_stream_t *from_id,
hs_scratch_t *scratch,
match_event_handler onEvent,
void *context);
/**
* The block (non-streaming) regular expression scanner.
@ -355,10 +356,10 @@ hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id,
* match callback indicated that scanning should stop; other values on
* error.
*/
hs_error_t hs_scan(const hs_database_t *db, const char *data,
unsigned int length, unsigned int flags,
hs_scratch_t *scratch, match_event_handler onEvent,
void *context);
hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data,
unsigned int length, unsigned int flags,
hs_scratch_t *scratch, match_event_handler onEvent,
void *context);
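A block-mode sketch including a match callback of the documented shape (editorial; identifiers are illustrative):

```c
#include <stdio.h>
#include <hs.h>

static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    printf("match for id %u ending at offset %llu\n", id, to);
    return 0; /* returning non-zero terminates the scan */
}

/* ... hs_scan(db, data, len, 0, scratch, on_match, NULL); ... */
```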
/**
* The vectored regular expression scanner.
@ -398,10 +399,12 @@ hs_error_t hs_scan(const hs_database_t *db, const char *data,
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match
* callback indicated that scanning should stop; other values on error.
*/
hs_error_t hs_scan_vector(const hs_database_t *db, const char *const *data,
const unsigned int *length, unsigned int count,
unsigned int flags, hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db,
const char *const *data,
const unsigned int *length,
unsigned int count, unsigned int flags,
hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
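The vectored scanner treats the supplied blocks as one logical buffer, so matches may cross block boundaries; a sketch (editorial, reusing the `on_match` callback from the previous example):

```c
const char *blocks[] = { "foob", "ar" };
const unsigned int lengths[] = { 4, 2 };
/* a pattern such as /foobar/ can match across the block boundary: */
hs_scan_vector(db, blocks, lengths, 2, 0, scratch, on_match, NULL);
```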
/**
* Allocate a "scratch" space for use by Hyperscan.
@ -429,7 +432,8 @@ hs_error_t hs_scan_vector(const hs_database_t *db, const char *const *data,
* allocation fails. Other errors may be returned if invalid parameters
* are specified.
*/
hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch);
hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db,
hs_scratch_t **scratch);
/**
* Allocate a scratch space that is a clone of an existing scratch space.
@ -449,7 +453,8 @@ hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch);
* @ref HS_SUCCESS on success; @ref HS_NOMEM if the allocation fails.
* Other errors may be returned if invalid parameters are specified.
*/
hs_error_t hs_clone_scratch(const hs_scratch_t *src, hs_scratch_t **dest);
hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src,
hs_scratch_t **dest);
/**
* Provides the size of the given scratch space.
@ -465,7 +470,8 @@ hs_error_t hs_clone_scratch(const hs_scratch_t *src, hs_scratch_t **dest);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_scratch_size(const hs_scratch_t *scratch, size_t *scratch_size);
hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch,
size_t *scratch_size);
/**
* Free a scratch block previously allocated by @ref hs_alloc_scratch() or @ref
@ -480,7 +486,7 @@ hs_error_t hs_scratch_size(const hs_scratch_t *scratch, size_t *scratch_size);
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t hs_free_scratch(hs_scratch_t *scratch);
hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch);
/**
* Callback 'from' return value, indicating that the start of this match was
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,7 +30,7 @@
#include "util/cpuid_flags.h"
HS_PUBLIC_API
hs_error_t hs_valid_platform(void) {
hs_error_t HS_CDECL hs_valid_platform(void) {
/* Hyperscan requires SSSE3, anything else is a bonus */
if (check_ssse3()) {
return HS_SUCCESS;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -31,6 +31,6 @@
#include "hs_version.h"
HS_PUBLIC_API
const char *hs_version(void) {
const char * HS_CDECL hs_version(void) {
return HS_VERSION_STRING;
}
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -172,6 +172,8 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
hwlm_group_t groups) {
assert(t);
DEBUG_PRINTF("buf len=%zu, start=%zu, groups=%llx\n", len, start, groups);
if (!groups) {
DEBUG_PRINTF("groups all off\n");
@ -201,6 +203,9 @@ hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len,
hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch,
size_t len, size_t start, HWLMCallback cb,
void *ctxt, hwlm_group_t groups) {
assert(t);
assert(scratch);
const u8 *hbuf = scratch->core_info.hbuf;
const size_t hlen = scratch->core_info.hlen;
const u8 *buf = scratch->core_info.buf;
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,31 +29,23 @@
/** \file
* \brief Hamster Wheel Literal Matcher: build code.
*/
#include "hwlm_build.h"
#include "grey.h"
#include "hwlm.h"
#include "hwlm_build.h"
#include "hwlm_internal.h"
#include "hwlm_literal.h"
#include "noodle_engine.h"
#include "noodle_build.h"
#include "scratch.h"
#include "ue2common.h"
#include "fdr/fdr_compile.h"
#include "nfa/shufticompile.h"
#include "nfa/trufflecompile.h"
#include "util/alloc.h"
#include "util/bitutils.h"
#include "util/charreach.h"
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/dump_charclass.h"
#include "util/target_info.h"
#include "util/ue2string.h"
#include "util/verify_types.h"
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
@ -61,431 +53,6 @@ using namespace std;
namespace ue2 {
static const unsigned int MAX_ACCEL_OFFSET = 16;
static const unsigned int MAX_SHUFTI_WIDTH = 240;
static
size_t mask_overhang(const hwlmLiteral &lit) {
size_t msk_true_size = lit.msk.size();
assert(msk_true_size <= HWLM_MASKLEN);
assert(HWLM_MASKLEN <= MAX_ACCEL_OFFSET);
for (u8 c : lit.msk) {
if (!c) {
msk_true_size--;
} else {
break;
}
}
if (lit.s.length() >= msk_true_size) {
return 0;
}
/* only short literals should be able to have a mask which overhangs */
assert(lit.s.length() < MAX_ACCEL_OFFSET);
return msk_true_size - lit.s.length();
}
static
bool findDVerm(const vector<const hwlmLiteral *> &lits, AccelAux *aux) {
const hwlmLiteral &first = *lits.front();
struct candidate {
candidate(void)
: c1(0), c2(0), max_offset(0), b5insens(false), valid(false) {}
candidate(const hwlmLiteral &base, u32 offset)
: c1(base.s[offset]), c2(base.s[offset + 1]), max_offset(0),
b5insens(false), valid(true) {}
char c1;
char c2;
u32 max_offset;
bool b5insens;
bool valid;
bool operator>(const candidate &other) const {
if (!valid) {
return false;
}
if (!other.valid) {
return true;
}
if (other.cdiffers() && !cdiffers()) {
return false;
}
if (!other.cdiffers() && cdiffers()) {
return true;
}
if (!other.b5insens && b5insens) {
return false;
}
if (other.b5insens && !b5insens) {
return true;
}
if (max_offset > other.max_offset) {
return false;
}
return true;
}
bool cdiffers(void) const {
if (!b5insens) {
return c1 != c2;
}
return (c1 & CASE_CLEAR) != (c2 & CASE_CLEAR);
}
};
candidate best;
for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()) - 1; i++) {
candidate curr(first, i);
/* check to see if this pair appears in each string */
for (const auto &lit_ptr : lits) {
const hwlmLiteral &lit = *lit_ptr;
if (lit.nocase && (ourisalpha(curr.c1) || ourisalpha(curr.c2))) {
curr.b5insens = true; /* no choice but to be case insensitive */
}
bool found = false;
bool found_nc = false;
for (u32 j = 0;
!found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1; j++) {
found |= curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
found_nc |= (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
&& (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
if (curr.b5insens) {
found = found_nc;
}
}
if (!curr.b5insens && !found && found_nc) {
curr.b5insens = true;
found = true;
}
if (!found) {
goto next_candidate;
}
}
/* check to find the max offset where this appears */
for (const auto &lit_ptr : lits) {
const hwlmLiteral &lit = *lit_ptr;
for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1;
j++) {
bool found = false;
if (curr.b5insens) {
found = (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
&& (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
} else {
found = curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
}
if (found) {
assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET);
ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit));
break;
}
}
}
if (curr > best) {
best = curr;
}
next_candidate:;
}
if (!best.valid) {
return false;
}
aux->dverm.offset = verify_u8(best.max_offset);
if (!best.b5insens) {
aux->dverm.accel_type = ACCEL_DVERM;
aux->dverm.c1 = best.c1;
aux->dverm.c2 = best.c2;
DEBUG_PRINTF("built dverm for %02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
} else {
aux->dverm.accel_type = ACCEL_DVERM_NOCASE;
aux->dverm.c1 = best.c1 & CASE_CLEAR;
aux->dverm.c2 = best.c2 & CASE_CLEAR;
DEBUG_PRINTF("built dverm nc for %02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
}
return true;
}
static
bool findSVerm(const vector<const hwlmLiteral *> &lits, AccelAux *aux) {
const hwlmLiteral &first = *lits.front();
struct candidate {
candidate(void)
: c(0), max_offset(0), b5insens(false), valid(false) {}
candidate(const hwlmLiteral &base, u32 offset)
: c(base.s[offset]), max_offset(0),
b5insens(false), valid(true) {}
char c;
u32 max_offset;
bool b5insens;
bool valid;
bool operator>(const candidate &other) const {
if (!valid) {
return false;
}
if (!other.valid) {
return true;
}
if (!other.b5insens && b5insens) {
return false;
}
if (other.b5insens && !b5insens) {
return true;
}
if (max_offset > other.max_offset) {
return false;
}
return true;
}
};
candidate best;
for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()); i++) {
candidate curr(first, i);
/* check to see if this character appears in each string */
for (const auto &lit_ptr : lits) {
const hwlmLiteral &lit = *lit_ptr;
if (lit.nocase && ourisalpha(curr.c)) {
curr.b5insens = true; /* no choice but to be case insensitive */
}
bool found = false;
bool found_nc = false;
for (u32 j = 0;
!found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
found |= curr.c == lit.s[j];
found_nc |= (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
if (curr.b5insens) {
found = found_nc;
}
}
if (!curr.b5insens && !found && found_nc) {
curr.b5insens = true;
found = true;
}
if (!found) {
goto next_candidate;
}
}
/* check to find the max offset where this appears */
for (const auto &lit_ptr : lits) {
const hwlmLiteral &lit = *lit_ptr;
for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
bool found = false;
if (curr.b5insens) {
found = (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
} else {
found = curr.c == lit.s[j];
}
if (found) {
assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET);
ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit));
}
}
}
if (curr > best) {
best = curr;
}
next_candidate:;
}
if (!best.valid) {
return false;
}
if (!best.b5insens) {
aux->verm.accel_type = ACCEL_VERM;
aux->verm.c = best.c;
DEBUG_PRINTF("built verm for %02hhx\n", aux->verm.c);
} else {
aux->verm.accel_type = ACCEL_VERM_NOCASE;
aux->verm.c = best.c & CASE_CLEAR;
DEBUG_PRINTF("built verm nc for %02hhx\n", aux->verm.c);
}
aux->verm.offset = verify_u8(best.max_offset);
return true;
}
static
void filterLits(const vector<hwlmLiteral> &lits, hwlm_group_t expected_groups,
vector<const hwlmLiteral *> *filtered_lits, u32 *min_len) {
*min_len = MAX_ACCEL_OFFSET;
for (const auto &lit : lits) {
if (!(lit.groups & expected_groups)) {
continue;
}
const size_t lit_len = lit.s.length();
if (lit_len < *min_len) {
*min_len = verify_u32(lit_len);
}
filtered_lits->push_back(&lit);
#ifdef DEBUG
DEBUG_PRINTF("lit:");
for (u32 i = 0; i < lit.s.length(); i++) {
printf("%02hhx", lit.s[i]);
}
printf("\n");
#endif
}
}
static
bool litGuardedByCharReach(const CharReach &cr, const hwlmLiteral &lit,
u32 max_offset) {
for (u32 i = 0; i <= max_offset && i < lit.s.length(); i++) {
unsigned char c = lit.s[i];
if (lit.nocase) {
if (cr.test(mytoupper(c)) && cr.test(mytolower(c))) {
return true;
}
} else {
if (cr.test(c)) {
return true;
}
}
}
return false;
}
static
void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
hwlm_group_t expected_groups, AccelAux *aux) {
DEBUG_PRINTF("building accel expected=%016llx\n", expected_groups);
u32 min_len = MAX_ACCEL_OFFSET;
vector<const hwlmLiteral *> filtered_lits;
filterLits(lits, expected_groups, &filtered_lits, &min_len);
if (filtered_lits.empty()) {
return;
}
if (findDVerm(filtered_lits, aux)
|| findSVerm(filtered_lits, aux)) {
return;
}
/* look for shufti/truffle */
vector<CharReach> reach(MAX_ACCEL_OFFSET, CharReach());
for (const auto &lit : lits) {
if (!(lit.groups & expected_groups)) {
continue;
}
u32 overhang = mask_overhang(lit);
for (u32 i = 0; i < overhang; i++) {
/* this offset overhangs the start of the real literal; look at the
* msk/cmp */
for (u32 j = 0; j < N_CHARS; j++) {
if ((j & lit.msk[i]) == lit.cmp[i]) {
reach[i].set(j);
}
}
}
for (u32 i = overhang; i < MAX_ACCEL_OFFSET; i++) {
CharReach &reach_i = reach[i];
u32 i_effective = i - overhang;
if (litGuardedByCharReach(reach_i, lit, i_effective)) {
continue;
}
unsigned char c = i_effective < lit.s.length() ? lit.s[i_effective]
: lit.s.back();
if (lit.nocase) {
reach_i.set(mytoupper(c));
reach_i.set(mytolower(c));
} else {
reach_i.set(c);
}
}
}
u32 min_count = ~0U;
u32 min_offset = ~0U;
for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) {
size_t count = reach[i].count();
DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i,
describeClass(reach[i]).c_str(), count);
if (count < min_count) {
min_count = (u32)count;
min_offset = i;
}
}
if (min_count > MAX_SHUFTI_WIDTH) {
DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count);
return;
}
const CharReach &cr = reach[min_offset];
if (-1 !=
shuftiBuildMasks(cr, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) {
DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n",
describeClass(cr).c_str(), cr.count(), min_offset);
aux->shufti.accel_type = ACCEL_SHUFTI;
aux->shufti.offset = verify_u8(min_offset);
return;
}
truffleBuildMasks(cr, (u8 *)&aux->truffle.mask1, (u8 *)&aux->truffle.mask2);
DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n",
describeClass(cr).c_str(), cr.count(), min_offset);
aux->truffle.accel_type = ACCEL_TRUFFLE;
aux->truffle.offset = verify_u8(min_offset);
}
static
void buildForwardAccel(HWLM *h, const vector<hwlmLiteral> &lits,
hwlm_group_t expected_groups) {
findForwardAccelScheme(lits, expected_groups, &h->accel1);
findForwardAccelScheme(lits, HWLM_ALL_GROUPS, &h->accel0);
h->accel1_groups = expected_groups;
}
static
void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
#ifdef DEBUG
@ -512,7 +79,6 @@ bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
static
bool isNoodleable(const vector<hwlmLiteral> &lits,
const hwlmStreamingControl *stream_control,
const CompileContext &cc) {
if (!cc.grey.allowNoodle) {
return false;
@ -523,19 +89,6 @@ bool isNoodleable(const vector<hwlmLiteral> &lits,
return false;
}
if (stream_control) { // nullptr if in block mode
if (lits.front().s.length() > stream_control->history_max + 1) {
DEBUG_PRINTF("length of %zu too long for history max %zu\n",
lits.front().s.length(),
stream_control->history_max);
return false;
}
if (2 * lits.front().s.length() - 2 > FDR_TEMP_BUF_SIZE) {
assert(0);
return false;
}
}
if (!lits.front().msk.empty()) {
DEBUG_PRINTF("noodle can't handle supplementary masks\n");
return false;
@ -544,23 +97,12 @@ bool isNoodleable(const vector<hwlmLiteral> &lits,
return true;
}
aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
hwlmStreamingControl *stream_control,
bool make_small, const CompileContext &cc,
hwlm_group_t expected_groups) {
bytecode_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits, bool make_small,
const CompileContext &cc,
UNUSED hwlm_group_t expected_groups) {
assert(!lits.empty());
dumpLits(lits);
if (stream_control) {
assert(stream_control->history_min <= stream_control->history_max);
// We should not have been passed any literals that are too long to
// match with a maximally-sized history buffer.
assert(all_of(begin(lits), end(lits), [&](const hwlmLiteral &lit) {
return lit.s.length() <= stream_control->history_max + 1;
}));
}
// Check that we haven't exceeded the maximum number of literals.
if (lits.size() > cc.grey.limitLiteralCount) {
throw ResourceLimitError();
@ -595,29 +137,21 @@ aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
assert(everyoneHasGroups(lits));
if (isNoodleable(lits, stream_control, cc)) {
if (isNoodleable(lits, cc)) {
DEBUG_PRINTF("build noodle table\n");
engType = HWLM_ENGINE_NOOD;
const hwlmLiteral &lit = lits.front();
auto noodle = noodBuildTable(lit);
if (noodle) {
engSize = noodSize(noodle.get());
}
if (stream_control) {
// For now, a single literal still goes to noodle and asks
// for a great big history
stream_control->literal_history_required = lit.s.length() - 1;
assert(stream_control->literal_history_required
<= stream_control->history_max);
engSize = noodle.size();
}
eng = move(noodle);
} else {
DEBUG_PRINTF("building a new deal\n");
engType = HWLM_ENGINE_FDR;
auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey,
stream_control);
auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey);
if (fdr) {
engSize = fdrSize(fdr.get());
engSize = fdr.size();
}
eng = move(fdr);
}
@ -631,23 +165,12 @@ aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
throw ResourceLimitError();
}
auto h = aligned_zmalloc_unique<HWLM>(ROUNDUP_CL(sizeof(HWLM)) + engSize);
const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize;
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
h->type = engType;
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
if (engType == HWLM_ENGINE_FDR && cc.grey.hamsterAccelForward) {
buildForwardAccel(h.get(), lits, expected_groups);
}
if (stream_control) {
DEBUG_PRINTF("requires %zu (of max %zu) bytes of history\n",
stream_control->literal_history_required,
stream_control->history_max);
assert(stream_control->literal_history_required
<= stream_control->history_max);
}
return h;
}
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -34,9 +34,8 @@
#define HWLM_BUILD_H
#include "hwlm.h"
#include "hwlm_literal.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include <memory>
#include <vector>
@ -47,30 +46,12 @@ namespace ue2 {
struct CompileContext;
struct Grey;
struct target_t;
/** \brief Structure gathering together the input/output parameters related to
* streaming mode operation. */
struct hwlmStreamingControl {
/** \brief IN parameter: Upper limit on the amount of history that can be
* requested. */
size_t history_max;
/** \brief IN parameter: History already known to be used before literal
* analysis. */
size_t history_min;
/** \brief OUT parameter: History required by the literal matcher to
* correctly match all literals. */
size_t literal_history_required;
};
struct hwlmLiteral;
/** \brief Build an \ref HWLM literal matcher runtime structure for a group of
* literals.
*
* \param lits The group of literals.
* \param stream_control Streaming control parameters. If the matcher will
* operate in non-streaming (block) mode, this pointer should be NULL.
* \param make_small Optimise matcher for small size.
* \param cc Compile context.
* \param expected_groups FIXME: document me!
@ -79,11 +60,9 @@ struct hwlmStreamingControl {
* may result in a nullptr return value, or a std::bad_alloc exception being
* thrown.
*/
aligned_unique_ptr<HWLM>
hwlmBuild(const std::vector<hwlmLiteral> &lits,
hwlmStreamingControl *stream_control, bool make_small,
const CompileContext &cc,
hwlm_group_t expected_groups = HWLM_ALL_GROUPS);
bytecode_ptr<HWLM> hwlmBuild(const std::vector<hwlmLiteral> &lits,
bool make_small, const CompileContext &cc,
hwlm_group_t expected_groups = HWLM_ALL_GROUPS);
/**
* Returns an estimate of the number of repeated characters on the end of a
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -37,12 +37,13 @@
#include "ue2common.h"
#include <string>
#include <tuple>
#include <vector>
namespace ue2 {
/** \brief Max length of the literal passed to HWLM. */
#define HWLM_LITERAL_MAX_LEN 255
#define HWLM_LITERAL_MAX_LEN 8
/** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. */
#define HWLM_MASKLEN 8
@ -111,6 +112,19 @@ struct hwlmLiteral {
: hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {}
};
inline
bool operator<(const hwlmLiteral &a, const hwlmLiteral &b) {
return std::tie(a.id, a.s, a.nocase, a.noruns, a.groups, a.msk, a.cmp) <
std::tie(b.id, b.s, b.nocase, b.noruns, b.groups, b.msk, b.cmp);
}
inline
bool operator==(const hwlmLiteral &a, const hwlmLiteral &b) {
return a.id == b.id && a.s == b.s && a.nocase == b.nocase &&
a.noruns == b.noruns && a.groups == b.groups && a.msk == b.msk &&
a.cmp == b.cmp;
}
/**
* Consistency test; returns false if the given msk/cmp test can never match
* the literal string s.
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,7 +35,6 @@
#include "hwlm_literal.h"
#include "noodle_internal.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/verify_types.h"
#include "ue2common.h"
@ -67,7 +66,7 @@ size_t findNoodFragOffset(const hwlmLiteral &lit) {
return offset;
}
aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
bytecode_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
if (!lit.msk.empty()) {
DEBUG_PRINTF("noodle can't handle supplementary masks\n");
return nullptr;
@ -75,7 +74,7 @@ aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
const auto &s = lit.s;
size_t noodle_len = sizeof(noodTable) + s.length();
auto n = aligned_zmalloc_unique<noodTable>(noodle_len);
auto n = make_zeroed_bytecode_ptr<noodTable>(noodle_len);
assert(n);
size_t key_offset = findNoodFragOffset(lit);

View File
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,11 +30,11 @@
* \brief Noodle literal matcher: build code.
*/
#ifndef NOODLE_BUILD_H_048A1A6D585A9A
#define NOODLE_BUILD_H_048A1A6D585A9A
#ifndef NOODLE_BUILD_H
#define NOODLE_BUILD_H
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
struct noodTable;
@ -43,7 +43,7 @@ namespace ue2 {
struct hwlmLiteral;
/** \brief Construct a Noodle matcher for the given literal. */
ue2::aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit);
bytecode_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit);
size_t noodSize(const noodTable *n);
@ -61,5 +61,5 @@ void noodPrintStats(const noodTable *n, FILE *f);
#endif // DUMP_SUPPORT
#endif /* NOODLE_BUILD_H_048A1A6D585A9A */
#endif /* NOODLE_BUILD_H */
View File
@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,8 +33,11 @@
#include "noodle_engine.h"
#include "noodle_internal.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/masked_move.h"
#include "util/simd_utils.h"
@ -50,6 +53,24 @@ struct cb_info {
size_t offsetAdj; //!< used in streaming mode
};
#if defined(HAVE_AVX512)
#define CHUNKSIZE 64
#define MASK_TYPE m512
#define Z_BITS 64
#define Z_TYPE u64a
#elif defined(HAVE_AVX2)
#define CHUNKSIZE 32
#define MASK_TYPE m256
#define Z_BITS 32
#define Z_TYPE u32
#else
#define CHUNKSIZE 16
#define MASK_TYPE m128
#define Z_BITS 32
#define Z_TYPE u32
#endif
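/* Z_TYPE/Z_BITS track the width of the per-byte comparison mask the chosen
 * ISA produces: the 64-byte AVX-512 path yields a 64-bit mask (u64a,
 * scanned with findAndClearLSB_64 via the JOIN() token-pasting below),
 * while the 16- and 32-byte paths fit their masks in a u32. */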
#define RETURN_IF_TERMINATED(x) \
{ \
if ((x) == HWLM_TERMINATED) { \
@ -60,8 +81,9 @@ struct cb_info {
#define SINGLE_ZSCAN() \
do { \
while (unlikely(z)) { \
u32 pos = findAndClearLSB_32(&z); \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi, \
matchPos); \
RETURN_IF_TERMINATED(rv); \
@ -71,8 +93,9 @@ struct cb_info {
#define DOUBLE_ZSCAN() \
do { \
while (unlikely(z)) { \
u32 pos = findAndClearLSB_32(&z); \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos - 1; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1, \
noCase, cbi, matchPos); \
RETURN_IF_TERMINATED(rv); \
@ -109,7 +132,11 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
return HWLM_SUCCESS;
}
#if defined(__AVX2__)
#if defined(HAVE_AVX512)
#define CHUNKSIZE 64
#define MASK_TYPE m512
#include "noodle_engine_avx512.c"
#elif defined(HAVE_AVX2)
#define CHUNKSIZE 32
#define MASK_TYPE m256
#include "noodle_engine_avx2.c"
@ -122,12 +149,14 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
static really_inline
hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key,
bool noCase, const struct cb_info *cbi) {
hwlm_error_t rv;
size_t end = len;
const MASK_TYPE mask1 = getMask(key[0], noCase);
const MASK_TYPE caseMask = getCaseMask();
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
size_t end = len;
if (len < CHUNKSIZE) {
rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len);
return rv;
@ -172,13 +201,15 @@ hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key,
cbi, s2End, end);
return rv;
#else // HAVE_AVX512
return scanSingle512(buf, len, key, noCase, caseMask, mask1, cbi);
#endif
}
static really_inline
hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
const struct cb_info *cbi) {
hwlm_error_t rv;
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - keyLen + keyOffset + 2;
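    /* Worked example: key "abcdef" (keyLen 6) with its two-byte fragment at
     * keyOffset 2 ("cd") in a buffer of len 16 gives end = 16 - 6 + 2 + 2 =
     * 14, so the fragment's second byte is scanned no further than offset
     * 13, leaving offsets 14..15 for the two key bytes after the fragment. */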
@ -187,6 +218,9 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase);
const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase);
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
if (end - keyOffset < CHUNKSIZE) {
rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, keyOffset, end);
@ -243,6 +277,10 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
caseMask, mask1, mask2, cbi, off, end);
return rv;
#else // AVX512
return scanDouble512(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, keyOffset, end);
#endif // AVX512
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -117,9 +117,9 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
if (l < 4) {
u8 *vp = (u8*)&v;
switch (l) {
case 3: vp[2] = d[2];
case 2: vp[1] = d[1];
case 1: vp[0] = d[0];
case 3: vp[2] = d[2]; // fallthrough
case 2: vp[1] = d[1]; // fallthrough
case 1: vp[0] = d[0]; // fallthrough
}
} else {
v = masked_move256_len(d, l);
@ -157,9 +157,9 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
if (l < 4) {
u8 *vp = (u8*)&v;
switch (l) {
case 3: vp[2] = d[2];
case 2: vp[1] = d[1];
case 1: vp[0] = d[0];
case 3: vp[2] = d[2]; // fallthrough
case 2: vp[1] = d[1]; // fallthrough
case 1: vp[0] = d[0]; // fallthrough
}
} else {
v = masked_move256_len(d, l);

View File

@ -0,0 +1,193 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* noodle scan parts for AVX512 */
static really_inline
m512 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set64x8(k);
}
static really_inline
m512 getCaseMask(void) {
return set64x8(CASE_CLEAR);
}
// The short scan routine. It is used both to scan data up to an
// alignment boundary if needed, and to finish off data that the aligned
// scan function can't handle (due to a small or unaligned chunk at the end).
static really_inline
hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
ptrdiff_t scan_len = end - start;
DEBUG_PRINTF("scan_len %zu\n", scan_len);
assert(scan_len <= 64);
if (!scan_len) {
return HWLM_SUCCESS;
}
__mmask64 k = (~0ULL) >> (64 - scan_len);
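    // e.g. scan_len == 5 gives k == 0x1f: only the low five lanes are
    // loaded by loadu_maskz_m512() below; the rest read as zero.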
DEBUG_PRINTF("load mask 0x%016llx\n", k);
m512 v = loadu_maskz_m512(k, d);
if (noCase) {
v = and512(v, caseMask);
}
// reuse the load mask to indicate valid bytes
u64a z = masked_eq512mask(k, mask1, v);
SINGLE_ZSCAN();
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key,
bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi) {
const u8 *d = buf;
const u8 *e = buf + len;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (d + 64 >= e) {
goto tail;
}
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0,
d1 - d) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
for (; d + 64 < e; d += 64) {
DEBUG_PRINTF("d %p e %p \n", d, e);
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
u64a z = eq512mask(mask1, v);
__builtin_prefetch(d + 128);
SINGLE_ZSCAN();
}
tail:
DEBUG_PRINTF("d %p e %p \n", d, e);
// finish off tail
return scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, d - buf,
e - buf);
}
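/* The scan is thus three-phase: a masked short scan peels input up to the
 * first 64-byte cacheline boundary, the main loop runs aligned 64-byte
 * loads, and a final masked short scan handles the sub-64-byte tail. */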
static really_inline
hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m512 caseMask, m512 mask1, m512 mask2,
const struct cb_info *cbi, u64a *lastz0,
size_t start, size_t end) {
DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0);
const u8 *d = buf + start;
ptrdiff_t scan_len = end - start;
if (!scan_len) {
return HWLM_SUCCESS;
}
assert(scan_len <= 64);
__mmask64 k = (~0ULL) >> (64 - scan_len);
DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len);
m512 v = loadu_maskz_m512(k, d);
if (noCase) {
v = and512(v, caseMask);
}
u64a z0 = masked_eq512mask(k, mask1, v);
u64a z1 = masked_eq512mask(k, mask2, v);
u64a z = (*lastz0 | (z0 << 1)) & z1;
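    /* Worked example for fragment "ab" over buffer "xabx": z0 = 0b0010 (an
     * 'a' at offset 1), z1 = 0b0100 (a 'b' at offset 2), so z = (z0 << 1) &
     * z1 = 0b0100 and DOUBLE_ZSCAN() reports matchPos = 2 - 1 = 1. *lastz0
     * carries an 'a' in the final lane across to the next block. */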
DEBUG_PRINTF("z 0x%016llx\n", z);
DOUBLE_ZSCAN();
*lastz0 = z0 >> (scan_len - 1);
return HWLM_SUCCESS;
}
static really_inline
hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m512 caseMask, m512 mask1, m512 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
const u8 *e = buf + end;
u64a lastz0 = 0;
DEBUG_PRINTF("start %zu end %zu \n", start, end);
assert(d < e);
if (d + 64 >= e) {
goto tail;
}
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, &lastz0, start,
d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
for (; d + 64 < e; d += 64) {
DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0);
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
/* we have to pull the masks out of the AVX registers because we can't
byte shift between the lanes */
u64a z0 = eq512mask(mask1, v);
u64a z1 = eq512mask(mask2, v);
u64a z = (lastz0 | (z0 << 1)) & z1;
lastz0 = z0 >> 63;
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d + 256);
DEBUG_PRINTF("z 0x%016llx\n", z);
DOUBLE_ZSCAN();
}
tail:
DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf);
// finish off tail
return scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, &lastz0, d - buf, end);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,9 +30,6 @@
#include "shufti.h"
#include "truffle.h"
#include "vermicelli.h"
#include "multishufti.h"
#include "multitruffle.h"
#include "multivermicelli.h"
#include "ue2common.h"
const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
@ -132,220 +129,6 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
rv = c_end;
break;
/* multibyte matchers */
case ACCEL_MLVERM:
DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLVERM_NOCASE:
DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM:
DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MLGVERM_NOCASE:
DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM:
DEBUG_PRINTF("accel msverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSVERM_NOCASE:
DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM:
DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
break;
case ACCEL_MSGVERM_NOCASE:
DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
break;
case ACCEL_MDSVERM:
DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSVERM_NOCASE:
DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM:
DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MDSGVERM_NOCASE:
DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end,
accel->mdverm.len1, accel->mdverm.len2);
break;
case ACCEL_MLSHUFTI:
DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MLGSHUFTI:
DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSSHUFTI:
DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
accel->mshufti.len);
break;
case ACCEL_MDSSHUFTI:
DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MDSGSHUFTI:
DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
accel->mdshufti.len1, accel->mdshufti.len2);
break;
case ACCEL_MLTRUFFLE:
DEBUG_PRINTF("accel mltruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = long_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MLGTRUFFLE:
DEBUG_PRINTF("accel mlgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = longgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSTRUFFLE:
DEBUG_PRINTF("accel mstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shift_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MSGTRUFFLE:
DEBUG_PRINTF("accel msgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = shiftgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
c, c_end, accel->mtruffle.len);
break;
case ACCEL_MDSTRUFFLE:
DEBUG_PRINTF("accel mdstruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshift_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
case ACCEL_MDSGTRUFFLE:
DEBUG_PRINTF("accel mdsgtruffle %p %p\n", c, c_end);
if (c + 15 >= c_end) {
return c;
}
rv = doubleshiftgrab_truffleExec(accel->mdtruffle.mask1,
accel->mdtruffle.mask2, c, c_end,
accel->mdtruffle.len1,
accel->mdtruffle.len2);
break;
default:
assert(!"not here");

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -61,36 +61,7 @@ enum AccelType {
ACCEL_DSHUFTI,
ACCEL_TRUFFLE,
ACCEL_RED_TAPE,
/* multibyte vermicellis */
ACCEL_MLVERM,
ACCEL_MLVERM_NOCASE,
ACCEL_MLGVERM,
ACCEL_MLGVERM_NOCASE,
ACCEL_MSVERM,
ACCEL_MSVERM_NOCASE,
ACCEL_MSGVERM,
ACCEL_MSGVERM_NOCASE,
ACCEL_MDSVERM,
ACCEL_MDSVERM_NOCASE,
ACCEL_MDSGVERM,
ACCEL_MDSGVERM_NOCASE,
/* multibyte shuftis */
ACCEL_MLSHUFTI,
ACCEL_MLGSHUFTI,
ACCEL_MSSHUFTI,
ACCEL_MSGSHUFTI,
ACCEL_MDSSHUFTI,
ACCEL_MDSGSHUFTI,
/* multibyte truffles */
ACCEL_MLTRUFFLE,
ACCEL_MLGTRUFFLE,
ACCEL_MSTRUFFLE,
ACCEL_MSGTRUFFLE,
ACCEL_MDSTRUFFLE,
ACCEL_MDSGTRUFFLE,
/* masked dverm */
ACCEL_DVERM_MASKED,
};
/** \brief Structure for accel framework. */
@ -140,42 +111,12 @@ union AccelAux {
m128 lo2;
m128 hi2;
} dshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len;
} mshufti;
struct {
u8 accel_type;
u8 offset;
m128 lo;
m128 hi;
u8 len1;
u8 len2;
} mdshufti;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
} truffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len;
} mtruffle;
struct {
u8 accel_type;
u8 offset;
m128 mask1;
m128 mask2;
u8 len1;
u8 len2;
} mdtruffle;
};
/**

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,9 +33,11 @@
#include "nfagraph/ng_limex_accel.h"
#include "shufticompile.h"
#include "trufflecompile.h"
#include "util/accel_scheme.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/small_vector.h"
#include "util/verify_types.h"
#include <sstream>
@ -49,16 +51,15 @@ namespace ue2 {
namespace {
struct path {
vector<CharReach> reach;
small_vector<CharReach, MAX_ACCEL_DEPTH + 1> reach;
dstate_id_t dest = DEAD_STATE;
explicit path(dstate_id_t base) : dest(base) {
}
explicit path(dstate_id_t base) : dest(base) {}
};
};
static
void dump_paths(const vector<path> &paths) {
for (UNUSED const auto &p : paths) {
template<typename Container>
void dump_paths(const Container &paths) {
for (UNUSED const path &p : paths) {
DEBUG_PRINTF("[%s] -> %u\n", describeClasses(p.reach).c_str(), p.dest);
}
DEBUG_PRINTF("%zu paths\n", paths.size());
@ -113,17 +114,17 @@ void extend(const raw_dfa &rdfa, const path &p,
} else {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
out.push_back(move(pp));
}
}
if (!s.reports_eod.empty()) {
path pp = append(p, CharReach(), p.dest);
all[p.dest].push_back(pp);
out.push_back(pp);
out.push_back(move(pp));
}
map<u32, CharReach> dest;
flat_map<u32, CharReach> dest;
for (unsigned i = 0; i < N_CHARS; i++) {
u32 succ = s.next[rdfa.alpha_remap[i]];
dest[succ].set(i);
@ -140,7 +141,7 @@ void extend(const raw_dfa &rdfa, const path &p,
DEBUG_PRINTF("----good: [%s] -> %u\n",
describeClasses(pp.reach).c_str(), pp.dest);
all[e.first].push_back(pp);
out.push_back(pp);
out.push_back(move(pp));
}
}
@ -162,8 +163,10 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
dump_paths(paths);
vector<vector<CharReach>> rv;
rv.reserve(paths.size());
for (auto &p : paths) {
rv.push_back(move(p.reach));
rv.push_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
std::make_move_iterator(p.reach.end())));
}
return rv;
}
@ -327,7 +330,7 @@ accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const {
const dstate &raw = rdfa.states[this_idx];
const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
bool outs2_broken = false;
map<dstate_id_t, CharReach> succs;
flat_map<dstate_id_t, CharReach> succs;
for (u32 i = 0; i < rev_map.size(); i++) {
if (raw.next[i] == this_idx) {
@ -379,16 +382,18 @@ accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const {
for (auto jj = cr_all_j.find_first(); jj != CharReach::npos;
jj = cr_all_j.find_next(jj)) {
rv.double_byte.emplace((u8)ii, (u8)jj);
if (rv.double_byte.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
outs2_broken = true;
goto done;
}
}
}
}
}
if (rv.double_byte.size() > 8) {
DEBUG_PRINTF("outs2 too big\n");
outs2_broken = true;
}
done:
assert(outs2_broken || rv.double_byte.size() <= 8);
if (outs2_broken) {
rv.double_byte.clear();
}
@ -536,17 +541,17 @@ accel_dfa_build_strat::getAccelInfo(const Grey &grey) {
dstate_id_t sds_proxy = get_sds_or_proxy(rdfa);
DEBUG_PRINTF("sds %hu\n", sds_proxy);
for (size_t i = 0; i < rdfa.states.size(); i++) {
/* Find accel info for a single state. */
auto do_state = [&](size_t i) {
if (i == DEAD_STATE) {
continue;
return;
}
/* Note on report acceleration states: While we can't accelerate while
* we
* are spamming out callbacks, the QR code paths don't raise reports
* we are spamming out callbacks, the QR code paths don't raise reports
* during scanning so they can accelerate report states. */
if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) {
continue;
return;
}
size_t single_limit =
@ -557,15 +562,28 @@ accel_dfa_build_strat::getAccelInfo(const Grey &grey) {
if (ei.cr.count() > single_limit) {
DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i,
ei.cr.count());
continue;
return;
}
DEBUG_PRINTF("state %zu should be accelerable %zu\n", i, ei.cr.count());
rv[i] = ei;
};
if (only_accel_init) {
DEBUG_PRINTF("only computing accel for init states\n");
do_state(rdfa.start_anchored);
if (rdfa.start_floating != rdfa.start_anchored) {
do_state(rdfa.start_floating);
}
} else {
DEBUG_PRINTF("computing accel for all states\n");
for (size_t i = 0; i < rdfa.states.size(); i++) {
do_state(i);
}
}
/* provide accleration states to states in the region of sds */
/* provide acceleration states to states in the region of sds */
if (contains(rv, sds_proxy)) {
AccelScheme sds_ei = rv[sds_proxy];
sds_ei.double_byte.clear(); /* region based on single byte scheme

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -43,8 +43,8 @@ struct Grey;
class accel_dfa_build_strat : public dfa_build_strat {
public:
explicit accel_dfa_build_strat(const ReportManager &rm_in)
: dfa_build_strat(rm_in) {}
accel_dfa_build_strat(const ReportManager &rm_in, bool only_accel_init_in)
: dfa_build_strat(rm_in), only_accel_init(only_accel_init_in) {}
virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const;
virtual size_t accelSize(void) const = 0;
virtual u32 max_allowed_offset_accel() const = 0;
@ -53,6 +53,8 @@ public:
virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out);
virtual std::map<dstate_id_t, AccelScheme> getAccelInfo(const Grey &grey);
private:
bool only_accel_init;
};
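/* Subclasses thread this flag through their constructors; elsewhere in this
 * change, gough_build_strat constructs its base as
 * mcclellan_build_strat(r, rm_in, false), which (given the signature here)
 * computes accel info for all states rather than just the init states. */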
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -93,54 +93,6 @@ const char *accelName(u8 accel_type) {
return "truffle";
case ACCEL_RED_TAPE:
return "red tape";
case ACCEL_MLVERM:
return "multibyte long vermicelli";
case ACCEL_MLVERM_NOCASE:
return "multibyte long vermicelli nocase";
case ACCEL_MLGVERM:
return "multibyte long-grab vermicelli";
case ACCEL_MLGVERM_NOCASE:
return "multibyte long-grab vermicelli nocase";
case ACCEL_MSVERM:
return "multibyte shift vermicelli";
case ACCEL_MSVERM_NOCASE:
return "multibyte shift vermicelli nocase";
case ACCEL_MSGVERM:
return "multibyte shift-grab vermicelli";
case ACCEL_MSGVERM_NOCASE:
return "multibyte shift-grab vermicelli nocase";
case ACCEL_MDSVERM:
return "multibyte doubleshift vermicelli";
case ACCEL_MDSVERM_NOCASE:
return "multibyte doubleshift vermicelli nocase";
case ACCEL_MDSGVERM:
return "multibyte doubleshift-grab vermicelli";
case ACCEL_MDSGVERM_NOCASE:
return "multibyte doubleshift-grab vermicelli nocase";
case ACCEL_MLSHUFTI:
return "multibyte long shufti";
case ACCEL_MLGSHUFTI:
return "multibyte long-grab shufti";
case ACCEL_MSSHUFTI:
return "multibyte shift shufti";
case ACCEL_MSGSHUFTI:
return "multibyte shift-grab shufti";
case ACCEL_MDSSHUFTI:
return "multibyte doubleshift shufti";
case ACCEL_MDSGSHUFTI:
return "multibyte doubleshift-grab shufti";
case ACCEL_MLTRUFFLE:
return "multibyte long truffle";
case ACCEL_MLGTRUFFLE:
return "multibyte long-grab truffle";
case ACCEL_MSTRUFFLE:
return "multibyte shift truffle";
case ACCEL_MSGTRUFFLE:
return "multibyte shift-grab truffle";
case ACCEL_MDSTRUFFLE:
return "multibyte doubleshift truffle";
case ACCEL_MDSGTRUFFLE:
return "multibyte doubleshift-grab truffle";
default:
return "unknown!";
}
@ -283,59 +235,6 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
(const u8 *)&accel.truffle.mask2);
break;
}
case ACCEL_MLVERM:
case ACCEL_MLVERM_NOCASE:
case ACCEL_MLGVERM:
case ACCEL_MLGVERM_NOCASE:
case ACCEL_MSVERM:
case ACCEL_MSVERM_NOCASE:
case ACCEL_MSGVERM:
case ACCEL_MSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len);
break;
case ACCEL_MDSVERM:
case ACCEL_MDSVERM_NOCASE:
case ACCEL_MDSGVERM:
case ACCEL_MDSGVERM_NOCASE:
fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1,
accel.mdverm.len2);
break;
case ACCEL_MLSHUFTI:
case ACCEL_MLGSHUFTI:
case ACCEL_MSSHUFTI:
case ACCEL_MSGSHUFTI:
fprintf(f, " len:%u\n", accel.mshufti.len);
dumpShuftiMasks(f, (const u8 *)&accel.mshufti.lo,
(const u8 *)&accel.mshufti.hi);
dumpShuftiCharReach(f, (const u8 *)&accel.mshufti.lo,
(const u8 *)&accel.mshufti.hi);
break;
case ACCEL_MDSSHUFTI:
case ACCEL_MDSGSHUFTI:
fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2);
dumpShuftiMasks(f, (const u8 *)&accel.mdshufti.lo,
(const u8 *)&accel.mdshufti.hi);
dumpShuftiCharReach(f, (const u8 *)&accel.mdshufti.lo,
(const u8 *)&accel.mdshufti.hi);
break;
case ACCEL_MLTRUFFLE:
case ACCEL_MLGTRUFFLE:
case ACCEL_MSTRUFFLE:
case ACCEL_MSGTRUFFLE:
fprintf(f, " len:%u\n", accel.mtruffle.len);
dumpTruffleMasks(f, (const u8 *)&accel.mtruffle.mask1,
(const u8 *)&accel.mtruffle.mask2);
dumpTruffleCharReach(f, (const u8 *)&accel.mtruffle.mask1,
(const u8 *)&accel.mtruffle.mask2);
break;
case ACCEL_MDSTRUFFLE:
case ACCEL_MDSGTRUFFLE:
fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2);
dumpTruffleMasks(f, (const u8 *)&accel.mdtruffle.mask1,
(const u8 *)&accel.mdtruffle.mask2);
dumpTruffleCharReach(f, (const u8 *)&accel.mdtruffle.mask1,
(const u8 *)&accel.mdtruffle.mask2);
break;
default:
fprintf(f, "\n");
break;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -225,274 +225,6 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
aux->accel_type = ACCEL_NONE;
}
static
void buildAccelMulti(const AccelInfo &info, AccelAux *aux) {
if (info.ma_type == MultibyteAccelInfo::MAT_NONE) {
DEBUG_PRINTF("no multimatch for us :(");
return;
}
u32 offset = info.multiaccel_offset;
const CharReach &stops = info.multiaccel_stops;
assert(aux->accel_type == ACCEL_NONE);
if (stops.all()) {
return;
}
size_t outs = stops.count();
DEBUG_PRINTF("%zu outs\n", outs);
assert(outs && outs < 256);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (outs == 1) {
aux->accel_type = ACCEL_MLVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MLGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MLGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MSVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MSGVERM;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first();
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MSGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mverm.len = info.ma_len1;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFT:
if (outs == 1) {
aux->accel_type = ACCEL_MDSVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (outs == 1) {
aux->accel_type = ACCEL_MDSGVERM;
aux->mdverm.offset = offset;
aux->mdverm.c = stops.find_first();
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
return;
}
if (outs == 2 && stops.isCaselessChar()) {
aux->accel_type = ACCEL_MDSGVERM_NOCASE;
aux->mverm.offset = offset;
aux->mverm.c = stops.find_first() & CASE_CLEAR;
aux->mdverm.len1 = info.ma_len1;
aux->mdverm.len2 = info.ma_len2;
DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
aux->verm.c);
return;
}
break;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
(u8 *)&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_LONGGRAB:
if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
(u8 *)&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MLGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFT:
if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
(u8 *)&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
(u8 *)&aux->mshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MSGSHUFTI;
aux->mshufti.offset = offset;
aux->mshufti.len = info.ma_len1;
return;
case MultibyteAccelInfo::MAT_DSHIFT:
if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo,
(u8 *)&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo,
(u8 *)&aux->mdshufti.hi) == -1) {
break;
}
aux->accel_type = ACCEL_MDSGSHUFTI;
aux->mdshufti.offset = offset;
aux->mdshufti.len1 = info.ma_len1;
aux->mdshufti.len2 = info.ma_len2;
return;
default:
// shouldn't happen
assert(0);
return;
}
DEBUG_PRINTF("shufti build failed, falling through\n");
if (outs <= ACCEL_MAX_STOP_CHAR) {
DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
switch (info.ma_type) {
case MultibyteAccelInfo::MAT_LONG:
aux->accel_type = ACCEL_MLTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
aux->accel_type = ACCEL_MLGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFT:
aux->accel_type = ACCEL_MSTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
aux->accel_type = ACCEL_MSGTRUFFLE;
aux->mtruffle.offset = offset;
aux->mtruffle.len = info.ma_len1;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFT:
aux->accel_type = ACCEL_MDSTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mdtruffle.mask2);
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
aux->accel_type = ACCEL_MDSGTRUFFLE;
aux->mdtruffle.offset = offset;
aux->mdtruffle.len1 = info.ma_len1;
aux->mdtruffle.len2 = info.ma_len2;
truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
(u8 *)&aux->mdtruffle.mask2);
break;
default:
// shouldn't happen
assert(0);
return;
}
return;
}
DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs);
}
bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
assert(aux->accel_type == ACCEL_NONE);
if (info.single_stops.none()) {
@ -500,9 +232,6 @@ bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
aux->accel_type = ACCEL_RED_TAPE;
aux->generic.offset = info.single_offset;
}
if (aux->accel_type == ACCEL_NONE) {
buildAccelMulti(info, aux);
}
if (aux->accel_type == ACCEL_NONE) {
buildAccelDouble(info, aux);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -37,30 +37,9 @@ union AccelAux;
namespace ue2 {
struct MultibyteAccelInfo {
/* multibyte accel schemes, ordered by strength */
enum multiaccel_type {
MAT_SHIFT,
MAT_SHIFTGRAB,
MAT_DSHIFT,
MAT_DSHIFTGRAB,
MAT_LONG,
MAT_LONGGRAB,
MAT_MAX,
MAT_NONE = MAT_MAX
};
CharReach cr;
u32 offset = 0;
u32 len1 = 0;
u32 len2 = 0;
multiaccel_type type = MAT_NONE;
};
struct AccelInfo {
AccelInfo() : single_offset(0U), double_offset(0U),
single_stops(CharReach::dot()),
multiaccel_offset(0), ma_len1(0), ma_len2(0),
ma_type(MultibyteAccelInfo::MAT_NONE) {}
single_stops(CharReach::dot()) {}
u32 single_offset; /**< offset correction to apply to single schemes */
u32 double_offset; /**< offset correction to apply to double schemes */
CharReach double_stop1; /**< single-byte accel stop literals for double
@ -68,11 +47,6 @@ struct AccelInfo {
flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop
* literals */
CharReach single_stops; /**< escapes for single byte acceleration */
u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */
CharReach multiaccel_stops; /**< escapes for multibyte acceleration */
u32 ma_len1; /**< multiaccel len1 */
u32 ma_len2; /**< multiaccel len2 */
MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */
};
bool buildAccelAux(const AccelInfo &info, AccelAux *aux);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,9 +26,11 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief Castle: multi-tenant repeat engine, compiler code.
*/
#include "castlecompile.h"
#include "castle_internal.h"
@ -439,7 +441,7 @@ void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
}
}
aligned_unique_ptr<NFA>
bytecode_ptr<NFA>
buildCastle(const CastleProto &proto,
const map<u32, vector<vector<CharReach>>> &triggers,
const CompileContext &cc, const ReportManager &rm) {
@ -501,7 +503,7 @@ buildCastle(const CastleProto &proto,
// possibly means that we've got a repeat that we can't trigger. We do
// need to cope with it though.
if (contains(triggers, top)) {
min_period = minPeriod(triggers.at(top), cr, &is_reset);
min_period = depth(minPeriod(triggers.at(top), cr, &is_reset));
}
if (min_period > pr.bounds.max) {
@ -560,7 +562,7 @@ buildCastle(const CastleProto &proto,
DEBUG_PRINTF("%zu subcastles may go stale\n", may_stale.size());
vector<mmbit_sparse_iter> stale_iter;
if (!may_stale.empty()) {
mmbBuildSparseIterator(stale_iter, may_stale, numRepeats);
stale_iter = mmbBuildSparseIterator(may_stale, numRepeats);
}
@ -577,7 +579,7 @@ buildCastle(const CastleProto &proto,
total_size = ROUNDUP_N(total_size, alignof(mmbit_sparse_iter));
total_size += byte_length(stale_iter); // stale sparse iter
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
nfa->type = verify_u8(CASTLE_NFA);
nfa->length = verify_u32(total_size);
nfa->nPositions = verify_u32(subs.size());

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,7 +26,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief Castle: multi-tenant repeat engine, compiler code.
*/
@ -36,7 +37,7 @@
#include "nfa_kind.h"
#include "ue2common.h"
#include "nfagraph/ng_repeat.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include "util/depth.h"
#include "util/ue2_containers.h"
@ -120,7 +121,7 @@ void remapCastleTops(CastleProto &proto, std::map<u32, u32> &top_map);
* NOTE: Tops must be contiguous, i.e. \ref remapCastleTops must have been run
* first.
*/
ue2::aligned_unique_ptr<NFA>
bytecode_ptr<NFA>
buildCastle(const CastleProto &proto,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
const CompileContext &cc, const ReportManager &rm);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,12 +26,14 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for DFA minimization
*/
/**
* \file
* \brief Build code for DFA minimization.
*/
/**
* /Summary of the Hopcrofts algorithm/
* /Summary of the Hopcroft minimisation algorithm/
*
* partition := {F, Q \ F};
* work_queue := {F};
* while (work_queue is not empty) do
@ -57,22 +59,20 @@
#include "dfa_min.h"
#include "grey.h"
#include "nfa/rdfa.h"
#include "nfagraph/ng_mcclellan.h"
#include "rdfa.h"
#include "ue2common.h"
#include "util/partitioned_set.h"
#include "util/container.h"
#include "util/noncopyable.h"
#include "util/partitioned_set.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <queue>
#include <set>
#include <vector>
#include <iterator>
#include <boost/core/noncopyable.hpp>
#include <boost/dynamic_bitset.hpp>
using namespace std;
@ -81,118 +81,81 @@ namespace ue2 {
namespace {
struct hopcroft_state_info {
vector<vector<dstate_id_t> > prev;
explicit hopcroft_state_info(size_t alpha_size) : prev(alpha_size) {}
/** \brief Mapping from symbol to a list of predecessors that transition to
* this state on that symbol. */
vector<vector<dstate_id_t>> prev;
};
struct DFA_components : boost::noncopyable {
dstate_id_t nstates;
size_t inp_size;
set<size_t> work_queue;
/*Partition contains reduced states*/
partitioned_set<dstate_id_t> partition;
vector<hopcroft_state_info> states;
struct HopcroftInfo : noncopyable {
size_t alpha_size; //!< Size of DFA alphabet.
queue<size_t> work_queue; //!< Hopcroft work queue of partition indices.
partitioned_set<dstate_id_t> partition; //!< Partition set of DFA states.
vector<hopcroft_state_info> states; //!< Pre-calculated state info (preds)
explicit DFA_components(const raw_dfa &rdfa);
explicit HopcroftInfo(const raw_dfa &rdfa);
};
} //namespace
} // namespace
/**
* create_map:
* Creates an initial partitioning and work_queue.
* Initial partition contains {accepting states..., Non-accepting states}
* Initial work_queue contains accepting state subsets
* \brief Create an initial partitioning and work_queue.
*
* The initial partitioning needs to distinguish between the different
* reporting behaviours (unlike standard hopcroft) --> more than one subset
* possible for the accepting states.
* Initial partition contains {accepting states..., Non-accepting states}
* Initial work_queue contains accepting state subsets
*
* Look for accepting states in both reports and reports_eod.
* Creates a map with a key(reports, reports_eod) and an id.
* Reports of each state are searched against the map and
* added to the corresponding id -> partition[id] and work_queue[id].
* Non Accept states are added to partition[id+1].
* The initial partitioning needs to distinguish between the different
* reporting behaviours (unlike standard Hopcroft) --> more than one subset
* possible for the accepting states.
*
* Look for accepting states in both reports and reports_eod.
* Creates a map with a key(reports, reports_eod) and an id.
* Reports of each state are searched against the map and
* added to the corresponding id -> partition[id] and work_queue[id].
* Non Accept states are added to partition[id+1].
*/
static
vector<size_t> create_map(const raw_dfa &rdfa, set<size_t> &work_queue) {
vector<size_t> create_map(const raw_dfa &rdfa, queue<size_t> &work_queue) {
using ReportKey = pair<flat_set<ReportID>, flat_set<ReportID>>;
map<ReportKey, size_t> subset_map;
vector<size_t> state_to_subset(rdfa.states.size(), INVALID_SUBSET);
for (size_t i = 0; i < rdfa.states.size(); i++) {
if (!rdfa.states[i].reports.empty() ||
!rdfa.states[i].reports_eod.empty()) {
ReportKey key(rdfa.states[i].reports, rdfa.states[i].reports_eod);
const auto &ds = rdfa.states[i];
if (!ds.reports.empty() || !ds.reports_eod.empty()) {
ReportKey key(ds.reports, ds.reports_eod);
if (contains(subset_map, key)) {
state_to_subset[i] = subset_map[key];
} else {
size_t sub = subset_map.size();
subset_map[key] = sub;
subset_map.emplace(std::move(key), sub);
state_to_subset[i] = sub;
work_queue.insert(sub);
work_queue.push(sub);
}
}
}
/* handle non accepts */
/* Give non-accept states their own subset. */
size_t non_accept_sub = subset_map.size();
for (size_t i = 0; i < state_to_subset.size(); i++) {
if (state_to_subset[i] == INVALID_SUBSET) {
state_to_subset[i] = non_accept_sub;
}
}
replace(state_to_subset.begin(), state_to_subset.end(), INVALID_SUBSET,
non_accept_sub);
return state_to_subset;
}
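/* Worked example: in a three-state DFA where states 1 and 2 both report
 * {R1} (and nothing at EOD) while state 0 reports nothing, states 1 and 2
 * share the key ({R1}, {}) and land in subset 0, which is pushed to the
 * work queue once; state 0 falls through to the non-accept subset 1. */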
DFA_components::DFA_components(const raw_dfa &rdfa)
: nstates(rdfa.states.size()),
inp_size(rdfa.states[nstates - 1].next.size()),
partition(create_map(rdfa, work_queue)) {
/* initializing states */
for (size_t i = 0; i < nstates; i++) {
states.push_back(hopcroft_state_info());
states.back().prev.resize(inp_size);
}
for (size_t i = 0; i < nstates; i++) { // i is the previous state
for (size_t j = 0; j < inp_size; j++) {
/* Creating X_table */
dstate_id_t present_state = rdfa.states[i].next[j];
states[present_state].prev[j].push_back(i);
DEBUG_PRINTF("rdfa.states[%zu].next[%zu] %hu \n", i, j,
rdfa.states[i].next[j]);
HopcroftInfo::HopcroftInfo(const raw_dfa &rdfa)
: alpha_size(rdfa.alpha_size), partition(create_map(rdfa, work_queue)),
states(rdfa.states.size(), hopcroft_state_info(alpha_size)) {
/* Construct predecessor lists for each state, indexed by symbol. */
for (size_t i = 0; i < states.size(); i++) { // i is the previous state
for (size_t sym = 0; sym < alpha_size; sym++) {
dstate_id_t present_state = rdfa.states[i].next[sym];
states[present_state].prev[sym].push_back(i);
}
}
}
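/* Worked example: if states 0 and 1 both transition to state 1 on symbol
 * 'a', then states[1].prev['a'] == {0, 1}; dfa_min() below walks these
 * lists to find the predecessors of the current work-queue set. */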
/**
* choose and remove a set A from work_queue.
*/
static
void get_work_item(DFA_components &mdfa, ue2::flat_set<dstate_id_t> &A) {
A.clear();
assert(!mdfa.work_queue.empty());
set<size_t>::iterator pt = mdfa.work_queue.begin();
insert(&A, mdfa.partition[*pt]);
mdfa.work_queue.erase(pt);
}
/**
* X is the set of states for which a transition on the input leads to a state
* in A.
*/
static
void create_X(const DFA_components &mdfa, const ue2::flat_set<dstate_id_t> &A,
size_t inp, ue2::flat_set<dstate_id_t> &X) {
X.clear();
for (dstate_id_t id : A) {
insert(&X, mdfa.states[id].prev[inp]);
}
}
/**
* For a split set X, each subset S (given by part_index) in the partition, two
* sets are created: v_inter (X intersection S) and v_sub (S - X).
@ -206,14 +169,14 @@ void create_X(const DFA_components &mdfa, const ue2::flat_set<dstate_id_t> &A,
* - replace S in work_queue by the smaller of the two sets.
*/
static
void split_and_replace_set(const size_t part_index, DFA_components &mdfa,
const ue2::flat_set<dstate_id_t> &splitter) {
void split_and_replace_set(const size_t part_index, HopcroftInfo &info,
const flat_set<dstate_id_t> &splitter) {
/* singleton sets cannot be split */
if (mdfa.partition[part_index].size() == 1) {
if (info.partition[part_index].size() == 1) {
return;
}
size_t small_index = mdfa.partition.split(part_index, splitter);
size_t small_index = info.partition.split(part_index, splitter);
if (small_index == INVALID_SUBSET) {
/* the set could not be split */
@ -223,54 +186,56 @@ void split_and_replace_set(const size_t part_index, DFA_components &mdfa,
/* larger subset remains at the input subset index, if the input subset was
* already in the work queue then the larger subset will remain there. */
mdfa.work_queue.insert(small_index);
info.work_queue.push(small_index);
}
/**
* The complete Hopcrofts algorithm is implemented in this function.
* Choose and remove a set tray from work_queue
* For each input- X is created.
* For each subset in the partition, split_and_replace_sets are called with the
* split set.
* \brief Core of the Hopcroft minimisation algorithm.
*/
static
void dfa_min(DFA_components &mdfa) {
ue2::flat_set<dstate_id_t> A, X;
void dfa_min(HopcroftInfo &info) {
flat_set<dstate_id_t> curr, sym_preds;
vector<size_t> cand_subsets;
while (!mdfa.work_queue.empty()) {
get_work_item(mdfa, A);
while (!info.work_queue.empty()) {
/* Choose and remove a set of states (curr, or A in the description
* above) from the work queue. Note that we copy the set because the
* partition may be split by the loop below. */
curr.clear();
insert(&curr, info.partition[info.work_queue.front()]);
info.work_queue.pop();
for (size_t inp = 0; inp < mdfa.inp_size; inp++) {
create_X(mdfa, A, inp, X);
if (X.empty()) {
for (size_t sym = 0; sym < info.alpha_size; sym++) {
/* Find the set of states sym_preds for which a transition on the
* given symbol leads to a state in curr. */
sym_preds.clear();
for (dstate_id_t s : curr) {
insert(&sym_preds, info.states[s].prev[sym]);
}
if (sym_preds.empty()) {
continue;
}
/* we only need to consider subsets with at least one member in X for
* splitting */
/* we only need to consider subsets with at least one member in
* sym_preds for splitting */
cand_subsets.clear();
mdfa.partition.find_overlapping(X, &cand_subsets);
info.partition.find_overlapping(sym_preds, &cand_subsets);
for (size_t sub : cand_subsets) {
split_and_replace_set(sub, mdfa, X);
split_and_replace_set(sub, info, sym_preds);
}
}
}
}
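/* Worked example: two accepting states with identical reports and identical
 * outgoing transitions start in the same subset and no symbol's predecessor
 * set ever splits them, so they collapse into one state; an accepting state
 * with different reports starts in a different subset and is never merged. */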
/**
* Creating new dfa table
* Map ordering contains key being an equivalence classes first state
* and the value being the equivalence class index.
* Eq_state[i] tells us new state id the equivalence class located at
* partition[i].
* \brief Build the new DFA state table.
*/
static
void mapping_new_states(const DFA_components &mdfa,
vector<dstate_id_t> &old_to_new,
raw_dfa &rdfa) {
const size_t num_partitions = mdfa.partition.size();
void mapping_new_states(const HopcroftInfo &info,
vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
const size_t num_partitions = info.partition.size();
// Mapping from equiv class's first state to equiv class index.
map<dstate_id_t, size_t> ordering;
@ -279,7 +244,7 @@ void mapping_new_states(const DFA_components &mdfa,
vector<dstate_id_t> eq_state(num_partitions);
for (size_t i = 0; i < num_partitions; i++) {
ordering[*mdfa.partition[i].begin()] = i;
ordering[*info.partition[i].begin()] = i;
}
dstate_id_t new_id = 0;
@ -287,30 +252,28 @@ void mapping_new_states(const DFA_components &mdfa,
eq_state[m.second] = new_id++;
}
for (size_t t = 0; t < mdfa.partition.size(); t++) {
for (dstate_id_t id : mdfa.partition[t]) {
for (size_t t = 0; t < info.partition.size(); t++) {
for (dstate_id_t id : info.partition[t]) {
old_to_new[id] = eq_state[t];
}
}
vector<dstate> new_states;
new_states.reserve(num_partitions);
for (size_t i = 0; i < mdfa.nstates; i++) {
if (contains(ordering, i)) {
new_states.push_back(rdfa.states[i]);
}
for (const auto &m : ordering) {
new_states.push_back(rdfa.states[m.first]);
}
rdfa.states.swap(new_states);
rdfa.states = std::move(new_states);
}
static
void renumber_new_states(const DFA_components &mdfa,
const vector<dstate_id_t> &old_to_new,
raw_dfa &rdfa) {
for (size_t i = 0; i < mdfa.partition.size(); i++) {
for (size_t j = 0; j < mdfa.inp_size; j++) {
dstate_id_t output = rdfa.states[i].next[j];
rdfa.states[i].next[j] = old_to_new[output];
void renumber_new_states(const HopcroftInfo &info,
const vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
for (size_t i = 0; i < info.partition.size(); i++) {
for (size_t sym = 0; sym < info.alpha_size; sym++) {
dstate_id_t output = rdfa.states[i].next[sym];
rdfa.states[i].next[sym] = old_to_new[output];
}
dstate_id_t dad = rdfa.states[i].daddy;
rdfa.states[i].daddy = old_to_new[dad];
@ -321,17 +284,16 @@ void renumber_new_states(const DFA_components &mdfa,
}
static
void new_dfa(raw_dfa &rdfa, const DFA_components &mdfa) {
if (mdfa.partition.size() != mdfa.nstates) {
vector<dstate_id_t> old_to_new(mdfa.nstates);
mapping_new_states(mdfa, old_to_new, rdfa);
renumber_new_states(mdfa, old_to_new, rdfa);
void new_dfa(raw_dfa &rdfa, const HopcroftInfo &info) {
if (info.partition.size() == info.states.size()) {
return;
}
vector<dstate_id_t> old_to_new(info.states.size());
mapping_new_states(info, old_to_new, rdfa);
renumber_new_states(info, old_to_new, rdfa);
}
/**
* MAIN FUNCTION
*/
void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
if (!grey.minimizeDFA) {
return;
@ -339,10 +301,10 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
UNUSED const size_t states_before = rdfa.states.size();
DFA_components mdfa(rdfa);
HopcroftInfo info(rdfa);
dfa_min(mdfa);
new_dfa(rdfa, mdfa);
dfa_min(info);
new_dfa(rdfa, info);
DEBUG_PRINTF("reduced from %zu to %zu states\n", states_before,
rdfa.states.size());

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,8 +26,9 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for McClellan DFA.
/**
* \file
* \brief Build code for DFA minimization.
*/
#ifndef DFA_MIN_H

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,7 +35,6 @@
#include "grey.h"
#include "mcclellancompile.h"
#include "nfa_internal.h"
#include "util/alloc.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph_range.h"
@ -81,7 +80,7 @@ public:
gough_build_strat(
raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm_in,
const map<dstate_id_t, gough_accel_state_info> &accel_info)
: mcclellan_build_strat(r, rm_in), rdfa(r), gg(g),
: mcclellan_build_strat(r, rm_in, false), rdfa(r), gg(g),
accel_gough_info(accel_info) {}
unique_ptr<raw_report_info> gatherReports(vector<u32> &reports /* out */,
vector<u32> &reports_eod /* out */,
@ -1036,9 +1035,9 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
}
}
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc,
const ReportManager &rm) {
bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc,
const ReportManager &rm) {
assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8
|| !cc.streaming);
@ -1071,7 +1070,7 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
map<dstate_id_t, gough_accel_state_info> accel_allowed;
find_allowed_accel_states(*cfg, blocks, &accel_allowed);
gough_build_strat gbs(raw, *cfg, rm, accel_allowed);
aligned_unique_ptr<NFA> basic_dfa = mcclellanCompile_i(raw, gbs, cc);
auto basic_dfa = mcclellanCompile_i(raw, gbs, cc);
assert(basic_dfa);
if (!basic_dfa) {
return nullptr;
@ -1117,7 +1116,7 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
gi.stream_som_loc_width = somPrecision;
u32 gough_size = ROUNDUP_N(curr_offset, 16);
aligned_unique_ptr<NFA> gough_dfa = aligned_zmalloc_unique<NFA>(gough_size);
auto gough_dfa = make_zeroed_bytecode_ptr<NFA>(gough_size);
memcpy(gough_dfa.get(), basic_dfa.get(), basic_dfa->length);
memcpy((char *)gough_dfa.get() + haig_offset, &gi, sizeof(gi));

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,7 +32,7 @@
#include "mcclellancompile.h"
#include "nfa_kind.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
#include "util/order_check.h"
@ -88,10 +88,10 @@ struct raw_som_dfa : public raw_dfa {
* som */
};
aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc,
const ReportManager &rm);
bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
const CompileContext &cc,
const ReportManager &rm);
} // namespace ue2
#endif
#endif // GOUGHCOMPILE_H

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
#include "mcclellancompile.h"
#include "ue2common.h"
#include "util/charreach.h"
#include "util/noncopyable.h"
#include "util/order_check.h"
#include "util/ue2_containers.h"
@ -41,7 +42,6 @@
#include <set>
#include <vector>
#include <boost/core/noncopyable.hpp>
#include <boost/graph/adjacency_list.hpp>
namespace ue2 {
@ -103,7 +103,7 @@ struct GoughSSAVarWithInputs;
struct GoughSSAVarMin;
struct GoughSSAVarJoin;
struct GoughSSAVar : boost::noncopyable {
struct GoughSSAVar : noncopyable {
GoughSSAVar(void) : seen(false), slot(INVALID_SLOT) {}
virtual ~GoughSSAVar();
const ue2::flat_set<GoughSSAVar *> &get_inputs() const {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -39,11 +39,9 @@
#include "nfa_internal.h"
#include "shufti.h"
#include "truffle.h"
#include "multishufti.h"
#include "multitruffle.h"
#include "multivermicelli.h"
#include "ue2common.h"
#include "vermicelli.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
@ -118,7 +116,7 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex,
DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n");
m256 accelPerm = limex->accelPermute;
m256 accelComp = limex->accelCompare;
#if !defined(__AVX2__)
#if !defined(HAVE_AVX2)
u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo);
u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi);
assert((idx1 & idx2) == 0); // should be no shared bits
@ -153,18 +151,20 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex,
DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n");
m512 accelPerm = limex->accelPermute;
m512 accelComp = limex->accelCompare;
#if !defined(__AVX2__)
#if defined(HAVE_AVX512)
idx = packedExtract512(s, accelPerm, accelComp);
#elif defined(HAVE_AVX2)
u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
assert((idx1 & idx2) == 0); // should be no shared bits
idx = idx1 | idx2;
#else
u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo);
u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi);
u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo);
u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi);
assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits
idx = idx1 | idx2 | idx3 | idx4;
#else
u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
assert((idx1 & idx2) == 0); // should be no shared bits
idx = idx1 | idx2;
#endif
return accelScanWrapper(accelTable, aux, input, idx, i, end);
}

View File

@ -26,9 +26,11 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief Main NFA build code.
*/
#include "limex_compile.h"
#include "accel.h"
@ -47,6 +49,7 @@
#include "repeatcompile.h"
#include "util/alloc.h"
#include "util/bitutils.h"
#include "util/bytecode_ptr.h"
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/container.h"
@ -66,6 +69,7 @@
#include <vector>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/range/adaptor/map.hpp>
using namespace std;
@ -89,8 +93,6 @@ struct precalcAccel {
CharReach double_cr;
flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
u32 double_offset;
MultibyteAccelInfo ma_info;
};
struct limex_accel_info {
@ -354,16 +356,12 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
}
struct AccelBuild {
AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0), ma_len1(0),
ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0) {}
NFAVertex v;
u32 state;
u32 offset; // offset correction to apply
CharReach stop1; // single-byte accel stop literals
flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
u32 ma_len1; // multiaccel len1
u32 ma_len2; // multiaccel len2
MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type
};
static
@ -378,12 +376,7 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
build.stop1 = CharReach::dot();
} else {
const precalcAccel &precalc = bi.accel.precalc.at(ss);
unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2;
if (ma_len >= MULTIACCEL_MIN_LEN) {
build.ma_len1 = precalc.ma_info.len1;
build.stop1 = precalc.ma_info.cr;
build.offset = precalc.ma_info.offset;
} else if (precalc.double_lits.empty()) {
if (precalc.double_lits.empty()) {
build.stop1 = precalc.single_cr;
build.offset = precalc.single_offset;
} else {
@ -602,7 +595,6 @@ void fillAccelInfo(build_info &bi) {
limex_accel_info &accel = bi.accel;
unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
const CompileContext &cc = bi.cc;
const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
const u32 num_states = bi.num_states;
@ -659,27 +651,17 @@ void fillAccelInfo(build_info &bi) {
DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
as.double_offset);
// try multibyte acceleration first
MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc);
precalcAccel &pa = accel.precalc[state_set];
useful |= state_set;
// if we successfully built a multibyte accel scheme, use that
if (mai.type != MultibyteAccelInfo::MAT_NONE) {
pa.ma_info = mai;
DEBUG_PRINTF("multibyte acceleration!\n");
continue;
}
pa.single_offset = as.offset;
pa.single_cr = as.cr;
if (as.double_byte.size() != 0) {
pa.double_offset = as.double_offset;
pa.double_lits = as.double_byte;
pa.double_cr = as.double_cr;
}
}
useful |= state_set;
}
for (const auto &m : accel_map) {
@ -696,19 +678,8 @@ void fillAccelInfo(build_info &bi) {
state_set.reset();
state_set.set(state_id);
bool is_multi = false;
auto p_it = accel.precalc.find(state_set);
if (p_it != accel.precalc.end()) {
const precalcAccel &pa = p_it->second;
offset = max(pa.double_offset, pa.single_offset);
is_multi = pa.ma_info.type != MultibyteAccelInfo::MAT_NONE;
assert(offset <= MAX_ACCEL_DEPTH);
}
accel.accelerable.insert(v);
if (!is_multi) {
findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
}
findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
}
}
@ -721,6 +692,7 @@ typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)>>
static
u32 getEffectiveAccelStates(const build_info &args,
const unordered_map<NFAVertex, NFAVertex> &dom_map,
u32 active_accel_mask,
const vector<AccelBuild> &accelStates) {
/* accelStates is indexed by the acceleration bit index and contains a
@ -756,7 +728,6 @@ u32 getEffectiveAccelStates(const build_info &args,
* so we may still require on earlier states to be accurately modelled.
*/
const NGHolder &h = args.h;
auto dom_map = findDominators(h);
/* map from accel_id to mask of accel_ids that it is dominated by */
vector<u32> dominated_by(accelStates.size());
@ -773,8 +744,8 @@ u32 getEffectiveAccelStates(const build_info &args,
u32 accel_id = findAndClearLSB_32(&local_accel_mask);
assert(accel_id < accelStates.size());
NFAVertex v = accelStates[accel_id].v;
while (dom_map[v]) {
v = dom_map[v];
while (contains(dom_map, v) && dom_map.at(v)) {
v = dom_map.at(v);
if (contains(accel_id_map, v)) {
dominated_by[accel_id] |= 1U << accel_id_map[v];
}
@ -887,6 +858,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
return;
}
const auto dom_map = findDominators(args.h);
// We have 2^n different accel entries, one for each possible
// combination of accelerable states.
assert(accelStates.size() < 32);
@ -900,7 +873,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
effective_accel_set.push_back(0); /* empty is effectively empty */
for (u32 i = 1; i < accelCount; i++) {
u32 effective_i = getEffectiveAccelStates(args, i, accelStates);
u32 effective_i = getEffectiveAccelStates(args, dom_map, i,
accelStates);
effective_accel_set.push_back(effective_i);
if (effective_i == IMPOSSIBLE_ACCEL_MASK) {
@ -947,16 +921,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
if (contains(accel.precalc, effective_states)) {
const auto &precalc = accel.precalc.at(effective_states);
if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) {
ainfo.ma_len1 = precalc.ma_info.len1;
ainfo.ma_len2 = precalc.ma_info.len2;
ainfo.multiaccel_offset = precalc.ma_info.offset;
ainfo.multiaccel_stops = precalc.ma_info.cr;
ainfo.ma_type = precalc.ma_info.type;
} else {
ainfo.single_offset = precalc.single_offset;
ainfo.single_stops = precalc.single_cr;
}
ainfo.single_offset = precalc.single_offset;
ainfo.single_stops = precalc.single_cr;
}
}
@ -1637,6 +1603,84 @@ u32 findBestNumOfVarShifts(const build_info &args,
return bestNumOfVarShifts;
}
static
bool cannotDie(const build_info &args, const set<NFAVertex> &tops) {
const auto &h = args.h;
// When this top is activated, all of the vertices in 'tops' are switched
// on. If any of those lead to a graph that cannot die, then this top
// cannot die.
// For each top, we use a depth-first search to traverse the graph from the
// top, looking for a cyclic path consisting of vertices of dot reach. If
// one exists, then the NFA cannot die after this top is triggered.
vector<boost::default_color_type> colours(num_vertices(h));
auto colour_map = boost::make_iterator_property_map(colours.begin(),
get(vertex_index, h));
struct CycleFound {};
struct CannotDieVisitor : public boost::default_dfs_visitor {
void back_edge(const NFAEdge &e, const NGHolder &g) const {
DEBUG_PRINTF("back-edge %zu,%zu\n", g[source(e, g)].index,
g[target(e, g)].index);
if (g[target(e, g)].char_reach.all()) {
assert(g[source(e, g)].char_reach.all());
throw CycleFound();
}
}
};
try {
for (const auto &top : tops) {
DEBUG_PRINTF("checking top vertex %zu\n", h[top].index);
// Constrain the search to the top vertices and any dot vertices it
// can reach.
auto term_func = [&](NFAVertex v, const NGHolder &g) {
if (v == top) {
return false;
}
if (!g[v].char_reach.all()) {
return true;
}
if (contains(args.br_cyclic, v) &&
args.br_cyclic.at(v).repeatMax != depth::infinity()) {
// Bounded repeat vertices without inf max can be turned
// off.
return true;
}
return false;
};
boost::depth_first_visit(h, top, CannotDieVisitor(), colour_map,
term_func);
}
} catch (const CycleFound &) {
DEBUG_PRINTF("cycle found\n");
return true;
}
return false;
}
/** \brief True if this NFA cannot ever be in no states at all. */
static
bool cannotDie(const build_info &args) {
const auto &h = args.h;
const auto &state_ids = args.state_ids;
// If we have a startDs we're actually using, we can't die.
if (state_ids.at(h.startDs) != NO_STATE) {
DEBUG_PRINTF("is using startDs\n");
return true;
}
return all_of_in(args.tops | map_values, [&](const set<NFAVertex> &verts) {
return cannotDie(args, verts);
});
}
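
A minimal standalone sketch of the idea above (not Hyperscan's implementation, which runs Boost's depth_first_visit over the NGHolder with a terminator function): restrict a colouring DFS to dot-reach vertices and report any back-edge, since a reachable cycle of dot vertices means some state stays switched on forever.

#include <vector>

// Toy model of the cannotDie check, assuming a graph is just an adjacency
// list plus a per-vertex "dot reach" flag.
struct ToyGraph {
    std::vector<std::vector<int>> adj;
    std::vector<bool> dot; // true if the vertex matches any byte
};

static bool hasDotCycle(const ToyGraph &g, int v, std::vector<int> &colour) {
    colour[v] = 1; // grey: on the current DFS path
    for (int w : g.adj[v]) {
        if (!g.dot[w]) {
            continue; // constrain the search to dot vertices
        }
        if (colour[w] == 1) {
            return true; // back-edge between dot vertices: cycle found
        }
        if (colour[w] == 0 && hasDotCycle(g, w, colour)) {
            return true;
        }
    }
    colour[v] = 2; // black: fully explored
    return false;
}

int main() {
    // Vertex 1 has dot reach and a self-loop: once on, it never turns off.
    ToyGraph g{{{1}, {1}}, {false, true}};
    std::vector<int> colour(g.adj.size(), 0);
    return hasDotCycle(g, 0, colour) ? 0 : 1;
}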
template<NFAEngineType dtype>
struct Factory {
// typedefs for readability, for types derived from traits
@ -1700,8 +1744,8 @@ struct Factory {
static
void buildRepeats(const build_info &args,
vector<pair<aligned_unique_ptr<NFARepeatInfo>, size_t>> &out,
u32 *scratchStateSize, u32 *streamState) {
vector<bytecode_ptr<NFARepeatInfo>> &out,
u32 *scratchStateSize, u32 *streamState) {
out.reserve(args.repeats.size());
u32 repeat_idx = 0;
@ -1712,7 +1756,7 @@ struct Factory {
u32 tableOffset, tugMaskOffset;
size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
auto info = aligned_zmalloc_unique<NFARepeatInfo>(len);
auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
char *info_ptr = (char *)info.get();
// Collect state space info.
@ -1766,7 +1810,7 @@ struct Factory {
*streamState += streamStateLen;
*scratchStateSize += sizeof(RepeatControl);
out.emplace_back(move(info), len);
out.emplace_back(move(info));
}
}
@ -2074,8 +2118,7 @@ struct Factory {
}
static
void writeRepeats(const vector<pair<aligned_unique_ptr<NFARepeatInfo>,
size_t>> &repeats,
void writeRepeats(const vector<bytecode_ptr<NFARepeatInfo>> &repeats,
vector<u32> &repeatOffsets, implNFA_t *limex,
const u32 repeatOffsetsOffset, const u32 repeatOffset) {
const u32 num_repeats = verify_u32(repeats.size());
@ -2088,10 +2131,9 @@ struct Factory {
for (u32 i = 0; i < num_repeats; i++) {
repeatOffsets[i] = offset;
assert(repeats[i].first);
memcpy((char *)limex + offset, repeats[i].first.get(),
repeats[i].second);
offset += repeats[i].second;
assert(repeats[i]);
memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size());
offset += repeats[i].size();
}
// Write repeat offset lookup table.
@ -2112,19 +2154,19 @@ struct Factory {
}
static
aligned_unique_ptr<NFA> generateNfa(const build_info &args) {
bytecode_ptr<NFA> generateNfa(const build_info &args) {
if (args.num_states > NFATraits<dtype>::maxStates) {
return nullptr;
}
// Build bounded repeat structures.
vector<pair<aligned_unique_ptr<NFARepeatInfo>, size_t>> repeats;
vector<bytecode_ptr<NFARepeatInfo>> repeats;
u32 repeats_full_state = 0;
u32 repeats_stream_state = 0;
buildRepeats(args, repeats, &repeats_full_state, &repeats_stream_state);
size_t repeatSize = 0;
for (size_t i = 0; i < repeats.size(); i++) {
repeatSize += repeats[i].second;
repeatSize += repeats[i].size();
}
// We track report lists that have already been written into the global
@ -2214,7 +2256,7 @@ struct Factory {
size_t nfaSize = sizeof(NFA) + offset;
DEBUG_PRINTF("nfa size %zu\n", nfaSize);
auto nfa = aligned_zmalloc_unique<NFA>(nfaSize);
auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
assert(nfa); // otherwise we would have thrown std::bad_alloc
implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get());
@ -2234,6 +2276,11 @@ struct Factory {
limex->shiftCount = shiftCount;
writeShiftMasks(args, limex);
if (cannotDie(args)) {
DEBUG_PRINTF("nfa cannot die\n");
setLimexFlag(limex, LIMEX_FLAG_CANNOT_DIE);
}
// Determine the state required for our state vector.
findStateSize(args, limex);
@ -2295,7 +2342,7 @@ struct Factory {
template<NFAEngineType dtype>
struct generateNfa {
static aligned_unique_ptr<NFA> call(const build_info &args) {
static bytecode_ptr<NFA> call(const build_info &args) {
return Factory<dtype>::generateNfa(args);
}
};
@ -2392,17 +2439,15 @@ u32 max_state(const ue2::unordered_map<NFAVertex, u32> &state_ids) {
return rv;
}
aligned_unique_ptr<NFA> generate(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const map<NFAVertex, NFAStateSet> &reportSquashMap,
const map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies,
bool do_accel,
bool stateCompression,
u32 hint,
const CompileContext &cc) {
bytecode_ptr<NFA> generate(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const map<NFAVertex, NFAStateSet> &reportSquashMap,
const map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies, bool do_accel,
bool stateCompression, u32 hint,
const CompileContext &cc) {
const u32 num_states = max_state(states) + 1;
DEBUG_PRINTF("total states: %u\n", num_states);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,7 +26,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief Main NFA build code.
*/
@ -37,10 +38,10 @@
#include <memory>
#include <vector>
#include "ue2common.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_squash.h" // for NFAStateSet
#include "util/alloc.h"
#include "ue2common.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
struct NFA;
@ -50,7 +51,8 @@ namespace ue2 {
struct BoundedRepeatData;
struct CompileContext;
/** \brief Construct a LimEx NFA from an NGHolder.
/**
* \brief Construct a LimEx NFA from an NGHolder.
*
* \param g Input NFA graph. Must have state IDs assigned.
* \param repeats Bounded repeat information, if any.
@ -66,7 +68,7 @@ struct CompileContext;
* \return a built NFA, or nullptr if no NFA could be constructed for this
* graph.
*/
aligned_unique_ptr<NFA> generate(NGHolder &g,
bytecode_ptr<NFA> generate(NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &states,
const std::vector<BoundedRepeatData> &repeats,
const std::map<NFAVertex, NFAStateSet> &reportSquashMap,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -290,6 +290,20 @@ static
void dumpLimexText(const limex_type *limex, FILE *f) {
u32 size = limex_traits<limex_type>::size;
fprintf(f, "%u-bit LimEx NFA (%u shifts, %u exceptions)\n", size,
limex->shiftCount, limex->exceptionCount);
fprintf(f, "flags: ");
if (limex->flags & LIMEX_FLAG_COMPRESS_STATE) {
fprintf(f, "COMPRESS_STATE ");
}
if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) {
fprintf(f, "COMPRESS_MASKED ");
}
if (limex->flags & LIMEX_FLAG_CANNOT_DIE) {
fprintf(f, "CANNOT_DIE ");
}
fprintf(f, "\n\n");
dumpMask(f, "init", (const u8 *)&limex->init, size);
dumpMask(f, "init_dot_star", (const u8 *)&limex->initDS, size);
dumpMask(f, "accept", (const u8 *)&limex->accept, size);

View File

@ -85,6 +85,7 @@
#define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */
#define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */
#define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */
enum LimExTrigger {
LIMEX_TRIGGER_NONE = 0,

View File

@ -60,6 +60,7 @@
#define RUN_ACCEL_FN JOIN(LIMEX_API_ROOT, _Run_Accel)
#define RUN_EXCEPTIONS_FN JOIN(LIMEX_API_ROOT, _Run_Exceptions)
#define REV_STREAM_FN JOIN(LIMEX_API_ROOT, _Rev_Stream)
#define LOOP_NOACCEL_FN JOIN(LIMEX_API_ROOT, _Loop_No_Accel)
#define STREAM_FN JOIN(LIMEX_API_ROOT, _Stream)
#define STREAMCB_FN JOIN(LIMEX_API_ROOT, _Stream_CB)
#define STREAMFIRST_FN JOIN(LIMEX_API_ROOT, _Stream_First)
@ -172,24 +173,75 @@ size_t RUN_ACCEL_FN(const STATE_T s, UNUSED const STATE_T accelMask,
switch (limex_m->shiftCount) { \
case 8: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 7)); \
/* fallthrough */ \
case 7: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 6)); \
/* fallthrough */ \
case 6: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 5)); \
/* fallthrough */ \
case 5: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 4)); \
/* fallthrough */ \
case 4: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 3)); \
/* fallthrough */ \
case 3: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 2)); \
/* fallthrough */ \
case 2: \
succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 1)); \
/* fallthrough */ \
case 1: \
/* fallthrough */ \
case 0: \
; \
} \
} while (0)
/**
 * \brief LimEx NFA inner loop without accel.
*
* Note that the "all zeroes" early death check is only performed if can_die is
* true.
*
*/
static really_inline
char LOOP_NOACCEL_FN(const IMPL_NFA_T *limex, const u8 *input, size_t *loc,
size_t length, STATE_T *s_ptr, struct CONTEXT_T *ctx,
u64a offset, const char flags, u64a *final_loc,
const char first_match, const char can_die) {
const ENG_STATE_T *reach = get_reach_table(limex);
#if SIZE < 256
const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask);
#endif
const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex);
STATE_T s = *s_ptr;
size_t i = *loc;
for (; i != length; i++) {
DUMP_INPUT(i);
if (can_die && ISZERO_STATE(s)) {
DEBUG_PRINTF("no states are switched on, early exit\n");
break;
}
STATE_T succ;
NFA_EXEC_GET_LIM_SUCC(limex, s, succ);
if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset,
&succ, final_loc, ctx, flags, 0, first_match)) {
return MO_HALT_MATCHING;
}
u8 c = input[i];
s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]]));
}
*loc = i;
*s_ptr = s;
return MO_CONTINUE_MATCHING;
}
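
Note that `can_die` is pinned to a compile-time constant at each call site (see STREAM_FN below), so with forced inlining the dead-state check folds away entirely in the CANNOT_DIE case. A minimal sketch of that specialisation pattern, with illustrative names only:

#include <cstddef>

// An always-inlined worker takes the flag as a parameter; each caller pins
// it to a constant so the compiler drops the untaken branch entirely.
static inline int scan_worker(const unsigned char *buf, std::size_t len,
                              const char can_die) {
    int state = 1;
    for (std::size_t i = 0; i < len; i++) {
        if (can_die && state == 0) {
            break; // elided at compile time when can_die == 0
        }
        state &= buf[i]; // stand-in for the real transition function
    }
    return state;
}

int scan(const unsigned char *buf, std::size_t len, int cannot_die_flag) {
    // Two call sites yield two specialised copies of the loop body.
    return cannot_die_flag ? scan_worker(buf, len, 0)
                           : scan_worker(buf, len, 1);
}

int main() {
    const unsigned char buf[] = {1, 1, 0, 1};
    return scan(buf, 4, 0); // specialised with can_die == 1
}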
static really_inline
char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
@ -202,7 +254,8 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
= LOAD_FROM_ENG(&limex->accel_and_friends);
const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask);
#endif
const u8 *accelTable = (const u8 *)((const char *)limex + limex->accelTableOffset);
const u8 *accelTable =
(const u8 *)((const char *)limex + limex->accelTableOffset);
const union AccelAux *accelAux =
(const union AccelAux *)((const char *)limex + limex->accelAuxOffset);
const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex);
@ -221,24 +274,20 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
}
without_accel:
for (; i != min_accel_offset; i++) {
DUMP_INPUT(i);
if (ISZERO_STATE(s)) {
DEBUG_PRINTF("no states are switched on, early exit\n");
ctx->s = s;
return MO_CONTINUE_MATCHING;
}
u8 c = input[i];
STATE_T succ;
NFA_EXEC_GET_LIM_SUCC(limex, s, succ);
if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset,
&succ, final_loc, ctx, flags, 0, first_match)) {
if (limex->flags & LIMEX_FLAG_CANNOT_DIE) {
const char can_die = 0;
if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset,
flags, final_loc, first_match,
can_die) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
} else {
const char can_die = 1;
if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset,
flags, final_loc, first_match,
can_die) == MO_HALT_MATCHING) {
return MO_HALT_MATCHING;
}
s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]]));
}
with_accel:
@ -279,7 +328,6 @@ with_accel:
goto without_accel;
}
u8 c = input[i];
STATE_T succ;
NFA_EXEC_GET_LIM_SUCC(limex, s, succ);
@ -288,6 +336,7 @@ with_accel:
return MO_HALT_MATCHING;
}
u8 c = input[i];
s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]]));
}
@ -333,14 +382,13 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
u64a *final_loc = NULL;
for (size_t i = length; i != 0; i--) {
DUMP_INPUT(i-1);
DUMP_INPUT(i - 1);
if (ISZERO_STATE(s)) {
DEBUG_PRINTF("no states are switched on, early exit\n");
ctx->s = s;
return MO_CONTINUE_MATCHING;
}
u8 c = input[i-1];
STATE_T succ;
NFA_EXEC_GET_LIM_SUCC(limex, s, succ);
@ -349,6 +397,7 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
return MO_HALT_MATCHING;
}
u8 c = input[i - 1];
s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]]));
}
@ -999,6 +1048,7 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)(
#undef RUN_ACCEL_FN
#undef RUN_EXCEPTIONS_FN
#undef REV_STREAM_FN
#undef LOOP_NOACCEL_FN
#undef STREAM_FN
#undef STREAMCB_FN
#undef STREAMFIRST_FN

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -38,22 +38,23 @@
#define LIMEX_SHUFFLE_H
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
static really_inline
u32 packedExtract128(m128 s, const m128 permute, const m128 compare) {
m128 shuffled = pshufb(s, permute);
m128 shuffled = pshufb_m128(s, permute);
m128 compared = and128(shuffled, compare);
u16 rv = ~movemask128(eq128(compared, shuffled));
return (u32)rv;
}
#if defined(__AVX2__)
#if defined(HAVE_AVX2)
static really_inline
u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
// vpshufb doesn't cross lanes, so this is a bit of a cheat
m256 shuffled = vpshufb(s, permute);
m256 shuffled = pshufb_m256(s, permute);
m256 compared = and256(shuffled, compare);
u32 rv = ~movemask256(eq256(compared, shuffled));
// stitch the lane-wise results back together
@ -61,4 +62,17 @@ u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
}
#endif // AVX2
#if defined(HAVE_AVX512)
static really_inline
u32 packedExtract512(m512 s, const m512 permute, const m512 compare) {
// vpshufb doesn't cross lanes, so this is a bit of a cheat
m512 shuffled = pshufb_m512(s, permute);
m512 compared = and512(shuffled, compare);
u64a rv = ~eq512mask(compared, shuffled);
// stitch the lane-wise results back together
rv = rv >> 32 | rv;
return (u32)(((rv >> 16) | rv) & 0xffffU);
}
#endif // AVX512
#endif // LIMEX_SHUFFLE_H
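
A small worked example of the stitch arithmetic at the end of packedExtract512 (a standalone recomputation, not part of the header): each 16-bit quarter of the 64-bit mask carries one 128-bit lane's result, and the two shift-OR folds collapse all four quarters into bits 0..15.

#include <cassert>
#include <cstdint>

// Standalone recomputation of the lane-stitching fold in packedExtract512.
static uint32_t stitch(uint64_t rv) {
    rv = rv >> 32 | rv;                             // fold top half onto bottom
    return (uint32_t)(((rv >> 16) | rv) & 0xffffU); // fold again, keep 16 bits
}

int main() {
    // Four disjoint per-lane results, one in each 16-bit quarter.
    uint64_t rv = (0x0001ULL << 48) | (0x0020ULL << 32)
                | (0x0400ULL << 16) | 0x8000ULL;
    assert(stitch(rv) == 0x8421U); // the OR of the four quarters
    return 0;
}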

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -456,9 +456,8 @@ bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
}
static
aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
const CompileContext &cc,
set<dstate_id_t> *accel_states) {
bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 16\n");
vector<u32> reports; /* index in ri for the appropriate report list */
@ -497,7 +496,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
char *nfa_base = (char *)nfa.get();
populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset,
@ -685,9 +684,8 @@ void allocateFSN8(dfa_info &info,
}
static
aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
const CompileContext &cc,
set<dstate_id_t> *accel_states) {
bytecode_ptr<NFA> mcclellanCompile8(dfa_info &info, const CompileContext &cc,
set<dstate_id_t> *accel_states) {
DEBUG_PRINTF("building mcclellan 8\n");
vector<u32> reports;
@ -717,12 +715,13 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
char *nfa_base = (char *)nfa.get();
mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get());
allocateFSN8(info, accel_escape_info, &m->accel_limit_8, &m->accept_limit_8);
allocateFSN8(info, accel_escape_info, &m->accel_limit_8,
&m->accept_limit_8);
populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset,
accel_escape_info.size(), arb, single, nfa.get());
@ -763,7 +762,7 @@ aligned_unique_ptr<NFA> mcclellanCompile8(dfa_info &info,
#define MAX_SHERMAN_LIST_LEN 8
static
void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
void addIfEarlier(flat_set<dstate_id_t> &dest, dstate_id_t candidate,
dstate_id_t max) {
if (candidate < max) {
dest.insert(candidate);
@ -771,19 +770,41 @@ void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
}
static
void addSuccessors(set<dstate_id_t> &dest, const dstate &source,
void addSuccessors(flat_set<dstate_id_t> &dest, const dstate &source,
u16 alphasize, dstate_id_t curr_id) {
for (symbol_t s = 0; s < alphasize; s++) {
addIfEarlier(dest, source.next[s], curr_id);
}
}
/** \brief Returns a set of states to search for a better daddy. */
static
flat_set<dstate_id_t> find_daddy_candidates(const dfa_info &info,
dstate_id_t curr_id) {
flat_set<dstate_id_t> hinted;
addIfEarlier(hinted, 0, curr_id);
addIfEarlier(hinted, info.raw.start_anchored, curr_id);
addIfEarlier(hinted, info.raw.start_floating, curr_id);
// Add existing daddy and his successors, then search back one generation.
const u16 alphasize = info.impl_alpha_size;
dstate_id_t daddy = info.states[curr_id].daddy;
for (u32 level = 0; daddy && level < 2; level++) {
addIfEarlier(hinted, daddy, curr_id);
addSuccessors(hinted, info.states[daddy], alphasize, curr_id);
daddy = info.states[daddy].daddy;
}
return hinted;
}
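
A hedged sketch of the candidate walk in isolation, with states reduced to plain indices (the real code above also seeds the set with the anchored/floating start states and each daddy's successor states):

#include <cstdint>
#include <set>
#include <vector>

// Toy version of the two-generation daddy walk: collect the current daddy
// and granddaddy as candidates, keeping only states with a lower id.
std::set<uint32_t> daddyCandidates(const std::vector<uint32_t> &daddy,
                                   uint32_t curr) {
    std::set<uint32_t> hinted{0};
    uint32_t d = daddy[curr];
    for (int level = 0; d && level < 2; level++) {
        if (d < curr) {
            hinted.insert(d);
        }
        d = daddy[d];
    }
    return hinted;
}

int main() {
    // daddy[i] is the parent of state i; state 4's daddy is 3, whose daddy is 1.
    std::vector<uint32_t> daddy{0, 0, 1, 1, 3};
    auto hinted = daddyCandidates(daddy, 4);
    return hinted.count(3) && hinted.count(1) ? 0 : 1;
}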
#define MAX_SHERMAN_SELF_LOOP 20
static
void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
bool using8bit, bool any_cyclic_near_anchored_state,
const Grey &grey) {
void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
bool any_cyclic_near_anchored_state,
bool trust_daddy_states, const Grey &grey) {
if (!grey.allowShermanStates) {
return;
}
@ -818,21 +839,21 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
dstate_id_t best_daddy = 0;
dstate &currState = info.states[curr_id];
set<dstate_id_t> hinted; /* set of states to search for a better daddy */
addIfEarlier(hinted, 0, curr_id);
addIfEarlier(hinted, info.raw.start_anchored, curr_id);
addIfEarlier(hinted, info.raw.start_floating, curr_id);
dstate_id_t mydaddy = currState.daddy;
if (mydaddy) {
addIfEarlier(hinted, mydaddy, curr_id);
addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id);
dstate_id_t mygranddaddy = info.states[mydaddy].daddy;
if (mygranddaddy) {
addIfEarlier(hinted, mygranddaddy, curr_id);
addSuccessors(hinted, info.states[mygranddaddy], alphasize,
curr_id);
flat_set<dstate_id_t> hinted;
if (trust_daddy_states) {
// Use the daddy already set for this state so long as it isn't already
// a Sherman state.
if (!info.is_sherman(currState.daddy)) {
hinted.insert(currState.daddy);
} else {
// Fall back to granddaddy, which has already been processed (due
// to BFS ordering) and cannot be a Sherman state.
dstate_id_t granddaddy = info.states[currState.daddy].daddy;
assert(!info.is_sherman(granddaddy));
hinted.insert(granddaddy);
}
} else {
hinted = find_daddy_candidates(info, curr_id);
}
for (const dstate_id_t &donor : hinted) {
@ -885,7 +906,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
if (self_loop_width > MAX_SHERMAN_SELF_LOOP) {
DEBUG_PRINTF("%hu is banned wide self loop (%u)\n", curr_id,
self_loop_width);
self_loop_width);
return;
}
@ -939,9 +960,10 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
return false;
}
aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
const CompileContext &cc,
set<dstate_id_t> *accel_states) {
bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
const CompileContext &cc,
bool trust_daddy_states,
set<dstate_id_t> *accel_states) {
u16 total_daddy = 0;
dfa_info info(strat);
bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
@ -957,7 +979,7 @@ aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &
for (u32 i = 0; i < info.size(); i++) {
find_better_daddy(info, i, using8bit, any_cyclic_near_anchored_state,
cc.grey);
trust_daddy_states, cc.grey);
total_daddy += info.extra[i].daddytaken;
}
@ -965,7 +987,7 @@ aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &
info.size() * info.impl_alpha_size, info.size(),
info.impl_alpha_size);
aligned_unique_ptr<NFA> nfa;
bytecode_ptr<NFA> nfa;
if (!using8bit) {
nfa = mcclellanCompile16(info, cc, accel_states);
} else {
@ -980,11 +1002,13 @@ aligned_unique_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &
return nfa;
}
aligned_unique_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
set<dstate_id_t> *accel_states) {
mcclellan_build_strat mbs(raw, rm);
return mcclellanCompile_i(raw, mbs, cc, accel_states);
bytecode_ptr<NFA> mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
bool only_accel_init,
bool trust_daddy_states,
set<dstate_id_t> *accel_states) {
mcclellan_build_strat mbs(raw, rm, only_accel_init);
return mcclellanCompile_i(raw, mbs, cc, trust_daddy_states, accel_states);
}
size_t mcclellan_build_strat::accelSize(void) const {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,7 +32,7 @@
#include "accel_dfa_build_strat.h"
#include "rdfa.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
#include <memory>
@ -48,14 +48,15 @@ struct CompileContext;
class mcclellan_build_strat : public accel_dfa_build_strat {
public:
mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
: accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {}
mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in,
bool only_accel_init_in)
: accel_dfa_build_strat(rm_in, only_accel_init_in), rdfa(rdfa_in) {}
raw_dfa &get_raw() const override { return rdfa; }
std::unique_ptr<raw_report_info> gatherReports(
std::vector<u32> &reports /* out */,
std::vector<u32> &reports_eod /* out */,
u8 *isSingleReport /* out */,
ReportID *arbReport /* out */) const override;
ReportID *arbReport /* out */) const override;
size_t accelSize(void) const override;
u32 max_allowed_offset_accel() const override;
u32 max_stop_char() const override;
@ -65,17 +66,30 @@ private:
raw_dfa &rdfa;
};
/* accel_states: (optional) on success, is filled with the set of accelerable
* states */
ue2::aligned_unique_ptr<NFA>
/**
* \brief Construct an implementation DFA.
*
* \param raw the raw dfa to construct from
* \param cc compile context
 * \param rm report manager
* \param only_accel_init if true, only the init states will be examined for
* acceleration opportunities
* \param trust_daddy_states if true, trust the daddy state set in the raw dfa
* rather than conducting a search for a better daddy (for Sherman
* states)
 * \param accel_states (optional) on success, is filled with the set of
* accelerable states
*/
bytecode_ptr<NFA>
mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm,
const ReportManager &rm, bool only_accel_init,
bool trust_daddy_states = false,
std::set<dstate_id_t> *accel_states = nullptr);
/* used internally by mcclellan/haig/gough compile process */
ue2::aligned_unique_ptr<NFA>
bytecode_ptr<NFA>
mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
const CompileContext &cc,
const CompileContext &cc, bool trust_daddy_states = false,
std::set<dstate_id_t> *accel_states = nullptr);
/**
@ -89,4 +103,4 @@ bool has_accel_mcclellan(const NFA *nfa);
} // namespace ue2
#endif
#endif // MCCLELLANCOMPILE_H

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -43,6 +43,12 @@ namespace ue2 {
#define INIT_STATE 1
static
bool state_has_reports(const raw_dfa &raw, dstate_id_t s) {
const auto &ds = raw.states[s];
return !ds.reports.empty() || !ds.reports_eod.empty();
}
static
u32 count_dots(const raw_dfa &raw) {
assert(raw.start_anchored == INIT_STATE);
@ -60,8 +66,7 @@ u32 count_dots(const raw_dfa &raw) {
}
}
if (!raw.states[raw.states[i].next[0]].reports.empty()
|| !raw.states[raw.states[i].next[0]].reports_eod.empty()) {
if (state_has_reports(raw, raw.states[i].next[0])) {
goto validate;
}
@ -162,74 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
return last_d;
}
static
void find_in_edges(const raw_dfa &raw, vector<vector<dstate_id_t> > *in_edges) {
in_edges->clear();
in_edges->resize(raw.states.size());
ue2::unordered_set<dstate_id_t> seen;
for (u32 s = 1; s < raw.states.size(); s++) {
seen.clear();
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t t = raw.states[s].next[j];
if (contains(seen, t)) {
continue;
}
seen.insert(t);
(*in_edges)[t].push_back(s);
}
}
}
static
void calc_min_dist_to_accept(const raw_dfa &raw,
const vector<vector<dstate_id_t> > &in_edges,
vector<u32> *accept_dist) {
vector<u32> &dist = *accept_dist;
dist.clear();
dist.resize(raw.states.size(), ~0U);
/* for reporting states to start from */
deque<dstate_id_t> to_visit;
for (u32 s = 0; s < raw.states.size(); s++) {
if (!raw.states[s].reports.empty()
|| !raw.states[s].reports_eod.empty()) {
to_visit.push_back(s);
dist[s] = 0;
}
}
/* bfs */
UNUSED u32 last_d = 0;
while (!to_visit.empty()) {
dstate_id_t s = to_visit.front();
to_visit.pop_front();
assert(s != DEAD_STATE);
u32 d = dist[s];
assert(d >= last_d);
assert(d != ~0U);
for (vector<dstate_id_t>::const_iterator it = in_edges[s].begin();
it != in_edges[s].end(); ++it) {
dstate_id_t t = *it;
if (t == DEAD_STATE) {
continue;
}
if (dist[t] == ~0U) {
to_visit.push_back(t);
dist[t] = d + 1;
} else {
assert(dist[t] <= d + 1);
}
}
last_d = d;
}
}
bool prune_overlong(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
vector<u32> bob_dist;
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
@ -237,53 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
return false;
}
vector<vector<dstate_id_t> > in_edges;
find_in_edges(raw, &in_edges);
vector<u32> accept_dist;
calc_min_dist_to_accept(raw, in_edges, &accept_dist);
in_edges.clear();
/* look over the states and filter out any which cannot reach a report
* states before max_offset */
vector<dstate_id_t> new_ids(raw.states.size());
vector<dstate> new_states;
u32 count = 1;
new_states.push_back(raw.states[DEAD_STATE]);
bool changed = false;
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
if (bob_dist[s] + accept_dist[s] > max_offset) {
DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s],
accept_dist[s]);
new_ids[s] = DEAD_STATE;
} else {
new_ids[s] = count++;
new_states.push_back(raw.states[s]);
assert(new_states.size() == count);
assert(new_ids[s] <= s);
if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
auto &ds = raw.states[s];
ds.reports.clear();
ds.reports_eod.clear();
changed = true;
}
}
/* swap states */
DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count);
raw.states.swap(new_states);
new_states.clear();
/* update edges and daddys to refer to the new ids */
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t old_t = raw.states[s].next[j];
raw.states[s].next[j] = new_ids[old_t];
}
raw.states[s].daddy = new_ids[raw.states[s].daddy];
}
/* update specials */
raw.start_floating = new_ids[raw.start_floating];
raw.start_anchored = new_ids[raw.start_anchored];
return true;
return changed;
}
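
A toy recomputation of the new behaviour (states and reports reduced to plain integers; the real code uses calc_min_dist_from_bob and handles the dead state): compute BFS depth from the start state, then wipe the report sets of any state whose minimum depth exceeds max_offset.

#include <cstdint>
#include <queue>
#include <vector>

struct ToyDfa {
    std::vector<std::vector<uint32_t>> next; // next[state][symbol]
    std::vector<std::vector<int>> reports;   // reports[state]
};

bool clearDeeperReports(ToyDfa &dfa, uint32_t start, uint32_t max_offset) {
    std::vector<uint32_t> depth(dfa.next.size(), ~0u);
    std::queue<uint32_t> q;
    depth[start] = 0;
    q.push(start);
    while (!q.empty()) { // BFS for minimum depth from start
        uint32_t s = q.front();
        q.pop();
        for (uint32_t t : dfa.next[s]) {
            if (depth[t] == ~0u) {
                depth[t] = depth[s] + 1;
                q.push(t);
            }
        }
    }
    bool changed = false;
    for (uint32_t s = 0; s < dfa.next.size(); s++) {
        if (depth[s] > max_offset && !dfa.reports[s].empty()) {
            dfa.reports[s].clear(); // cannot report within max_offset
            changed = true;
        }
    }
    return changed;
}

int main() {
    // Chain 0 -> 1 -> 2 over a single-symbol alphabet; state 2 reports.
    ToyDfa dfa{{{1}, {2}, {2}}, {{}, {}, {7}}};
    return clearDeeperReports(dfa, 0, 1) ? 0 : 1; // depth(2) == 2 > 1
}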
set<ReportID> all_reports(const raw_dfa &rdfa) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -39,10 +39,12 @@ namespace ue2 {
u32 remove_leading_dots(raw_dfa &raw);
/**
* Prunes any states which cannot be reached within max_offset from start of
* stream. Returns false if no changes are made to the rdfa
* \brief Clear reports on any states that are deeper than \a max_offset from
* start of stream.
*
* Returns false if no changes are made to the DFA.
*/
bool prune_overlong(raw_dfa &raw, u32 max_offset);
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset);
std::set<ReportID> all_reports(const raw_dfa &rdfa);
bool has_eod_accepts(const raw_dfa &rdfa);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
#include "nfa_api.h"
#include "nfa_api_queue.h"
#include "nfa_internal.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/simd_utils.h"
@ -168,7 +169,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
* extract a single copy of the state from the u32 for checking. */
u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;
#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_limit_x4 = sheng_limit * 0x01010101;
m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
@ -176,20 +177,20 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
m->sheng_accel_limit, sheng_stop_limit);
#endif
#define SHENG_SINGLE_ITER do { \
m128 shuffle_mask = masks[*(c++)]; \
s = pshufb(shuffle_mask, s); \
u32 s_gpr_x4 = movd(s); /* convert to u8 */ \
DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr); \
if (s_gpr_x4 >= sheng_stop_limit_x4) { \
s_gpr = s_gpr_x4; \
goto exit; \
} \
#define SHENG_SINGLE_ITER do { \
m128 shuffle_mask = masks[*(c++)]; \
s = pshufb_m128(shuffle_mask, s); \
u32 s_gpr_x4 = movd(s); /* convert to u8 */ \
DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr_x4); \
if (s_gpr_x4 >= sheng_stop_limit_x4) { \
s_gpr = s_gpr_x4; \
goto exit; \
} \
} while (0)
u8 s_gpr;
while (c < c_end) {
#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
/* This version uses pext for efficiently bitbashing out scaled
* versions of the bytes to process from a u64a */
@ -197,7 +198,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */
data_bytes &= ~0xffULL; /* clear low bits for scale space */
m128 shuffle_mask0 = load128((const char *)masks + cc0);
s = pshufb(shuffle_mask0, s);
s = pshufb_m128(shuffle_mask0, s);
m128 s_max = s;
m128 s_max0 = s_max;
DEBUG_PRINTF("c %02llx --> s %hhu\n", cc0 >> 4, movd(s));
@ -207,7 +208,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]); \
assert(cc##iter == (u64a)c[iter] << 4); \
m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \
s = pshufb(shuffle_mask##iter, s); \
s = pshufb_m128(shuffle_mask##iter, s); \
if (do_accel && iter == 7) { \
/* in the final iteration we also have to check against accel */ \
m128 s_temp = sadd_u8_m128(s, accel_delta); \
@ -287,19 +288,19 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
assert(soft_c_end - c < SHENG_CHUNK);
switch (soft_c_end - c) {
case 7:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 6:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 5:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 4:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 3:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 2:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
case 1:
SHENG_SINGLE_ITER;
SHENG_SINGLE_ITER; // fallthrough
}
}
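
The remainder switch above relies on deliberate fallthrough so that exactly `soft_c_end - c` iterations of SHENG_SINGLE_ITER run without a loop. A minimal sketch of the same pattern on plain integers (illustrative only, not Hyperscan code):

#include <cassert>
#include <cstddef>

// The switch enters at the leftover count and falls through, so exactly
// `left` iterations execute without loop overhead.
static int sum_tail(const int *v, std::size_t left) {
    int total = 0;
    assert(left < 8);
    switch (left) {
    case 7: total += v[6]; // fallthrough
    case 6: total += v[5]; // fallthrough
    case 5: total += v[4]; // fallthrough
    case 4: total += v[3]; // fallthrough
    case 3: total += v[2]; // fallthrough
    case 2: total += v[1]; // fallthrough
    case 1: total += v[0]; // fallthrough
    default: break;
    }
    return total;
}

int main() {
    int v[] = {1, 2, 4, 8, 16, 32, 64};
    assert(sum_tail(v, 3) == 7); // 1 + 2 + 4
    return 0;
}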

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -617,7 +617,7 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info,
#define MAX_SHERMAN_LIST_LEN 8
static
void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
void addIfEarlier(flat_set<dstate_id_t> &dest, dstate_id_t candidate,
dstate_id_t max) {
if (candidate < max) {
dest.insert(candidate);
@ -625,13 +625,35 @@ void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
}
static
void addSuccessors(set<dstate_id_t> &dest, const dstate &source,
void addSuccessors(flat_set<dstate_id_t> &dest, const dstate &source,
u16 alphasize, dstate_id_t curr_id) {
for (symbol_t s = 0; s < alphasize; s++) {
addIfEarlier(dest, source.next[s], curr_id);
}
}
/** \brief Returns a set of states to search for a better daddy. */
static
flat_set<dstate_id_t> find_daddy_candidates(const dfa_info &info,
dstate_id_t curr_id) {
flat_set<dstate_id_t> hinted;
addIfEarlier(hinted, 0, curr_id);
addIfEarlier(hinted, info.raw.start_anchored, curr_id);
addIfEarlier(hinted, info.raw.start_floating, curr_id);
// Add existing daddy and his successors, then search back one generation.
const u16 alphasize = info.impl_alpha_size;
dstate_id_t daddy = info.states[curr_id].daddy;
for (u32 level = 0; daddy && level < 2; level++) {
addIfEarlier(hinted, daddy, curr_id);
addSuccessors(hinted, info.states[daddy], alphasize, curr_id);
daddy = info.states[daddy].daddy;
}
return hinted;
}
#define MAX_SHERMAN_SELF_LOOP 20
static
@ -671,22 +693,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
dstate_id_t best_daddy = 0;
dstate &currState = info.states[curr_id];
set<dstate_id_t> hinted; /* set of states to search for a better daddy */
addIfEarlier(hinted, 0, curr_id);
addIfEarlier(hinted, info.raw.start_anchored, curr_id);
addIfEarlier(hinted, info.raw.start_floating, curr_id);
dstate_id_t mydaddy = currState.daddy;
if (mydaddy) {
addIfEarlier(hinted, mydaddy, curr_id);
addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id);
dstate_id_t mygranddaddy = info.states[mydaddy].daddy;
if (mygranddaddy) {
addIfEarlier(hinted, mygranddaddy, curr_id);
addSuccessors(hinted, info.states[mygranddaddy], alphasize,
curr_id);
}
}
flat_set<dstate_id_t> hinted = find_daddy_candidates(info, curr_id);
for (const dstate_id_t &donor : hinted) {
assert(donor < curr_id);
@ -821,7 +828,7 @@ void fill_in_sherman(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
}
static
aligned_unique_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
bytecode_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
const Grey &grey) {
DEBUG_PRINTF("building mcsheng 16\n");
@ -872,7 +879,7 @@ aligned_unique_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get());
populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset,
@ -967,7 +974,7 @@ void allocateImplId8(dfa_info &info, dstate_id_t sheng_end,
}
static
aligned_unique_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
bytecode_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info) {
DEBUG_PRINTF("building mcsheng 8\n");
@ -998,7 +1005,7 @@ aligned_unique_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get());
allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8,
@ -1019,13 +1026,13 @@ aligned_unique_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
return nfa;
}
aligned_unique_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
if (!cc.grey.allowMcSheng) {
return nullptr;
}
mcclellan_build_strat mbs(raw, rm);
mcclellan_build_strat mbs(raw, rm, false);
dfa_info info(mbs);
bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
@ -1044,7 +1051,7 @@ aligned_unique_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
return nullptr;
}
aligned_unique_ptr<NFA> nfa;
bytecode_ptr<NFA> nfa;
if (!using8bit) {
nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey);
} else {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,13 +29,8 @@
#ifndef MCSHENGCOMPILE_H
#define MCSHENGCOMPILE_H
#include "accel_dfa_build_strat.h"
#include "rdfa.h"
#include "ue2common.h"
#include "util/alloc.h"
#include "util/ue2_containers.h"
#include <memory>
#include "util/bytecode_ptr.h"
struct NFA;
@ -43,10 +38,10 @@ namespace ue2 {
class ReportManager;
struct CompileContext;
struct raw_dfa;
ue2::aligned_unique_ptr<NFA>
mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm);
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm);
bool has_accel_mcsheng(const NFA *nfa);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -309,9 +309,9 @@ const mpv_counter_info &findCounter(const vector<mpv_counter_info> &counters,
return counters.front();
}
aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
const vector<raw_puff> &triggered_puffs,
const ReportManager &rm) {
bytecode_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
const vector<raw_puff> &triggered_puffs,
const ReportManager &rm) {
assert(!puffs_in.empty() || !triggered_puffs.empty());
u32 puffette_count = puffs_in.size() + triggered_puffs.size();
@ -343,7 +343,7 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
DEBUG_PRINTF("%u puffs, len = %u\n", puffette_count, len);
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(len);
auto nfa = make_zeroed_bytecode_ptr<NFA>(len);
mpv_puffette *pa_base = (mpv_puffette *)
((char *)nfa.get() + sizeof(NFA) + sizeof(mpv)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,7 +30,7 @@
#define MPV_COMPILE_H
#include "ue2common.h"
#include "util/alloc.h"
#include "util/bytecode_ptr.h"
#include "util/charreach.h"
#include <memory>
@ -61,9 +61,9 @@ struct raw_puff {
 * puffs in the triggered_puffs vector are enabled when a TOP_N event is
* delivered corresponding to their index in the vector
*/
aligned_unique_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs,
const std::vector<raw_puff> &triggered_puffs,
const ReportManager &rm);
bytecode_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs,
const std::vector<raw_puff> &triggered_puffs,
const ReportManager &rm);
} // namespace ue2

View File

@ -1,265 +0,0 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_COMMON_H_
#define MULTIACCEL_COMMON_H_
#include "config.h"
#include "ue2common.h"
#include "util/join.h"
#include "util/bitutils.h"
/*
* When doing shifting, remember that the total number of shifts should be n-1
*/
#define VARISHIFT(src, dst, len) \
do { \
(dst) &= (src) >> (len); \
} while (0)
#define STATIC_SHIFT1(x) \
do { \
(x) &= (x) >> 1; \
} while (0)
#define STATIC_SHIFT2(x) \
do { \
(x) &= (x) >> 2;\
} while (0)
#define STATIC_SHIFT4(x) \
do { \
(x) &= (x) >> 4; \
} while (0)
#define STATIC_SHIFT8(x) \
do { \
(x) &= (x) >> 8; \
} while (0)
#define SHIFT1(x) \
do {} while (0)
#define SHIFT2(x) \
do { \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT3(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT4(x) \
do { \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT5(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT6(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT7(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT8(x) \
do { \
SHIFT4(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT9(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT10(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT11(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT12(x) \
do { \
SHIFT8(x);\
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT13(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT14(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT15(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT16(x) \
do { \
SHIFT8(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT17(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT18(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT19(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT20(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT21(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT22(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT23(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT24(x) \
do { \
SHIFT16(x); \
STATIC_SHIFT8(x); \
} while (0)
#define SHIFT25(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
} while (0)
#define SHIFT26(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT27(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
} while (0)
#define SHIFT28(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT29(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT30(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT31(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT1(x); \
STATIC_SHIFT2(x); \
STATIC_SHIFT4(x); \
} while (0)
#define SHIFT32(x) \
do { \
SHIFT24(x); \
STATIC_SHIFT8(x); \
} while (0)
/*
 * This function is used by 32-bit multiaccel matchers. 32-bit matchers accept
 * a 32-bit integer as a buffer, where the low 16 bits are the movemask result
 * and the high 16 bits are "don't care" values. This function is not expected
 * to return a match offset of 16 or higher.
*/
static really_inline
const u8 *match32(const u8 *buf, const u32 z) {
if (unlikely(z != 0)) {
u32 pos = ctz32(z);
assert(pos < 16);
return buf + pos;
}
return NULL;
}
/*
 * This function is used by 64-bit multiaccel matchers. 64-bit matchers accept
 * a 64-bit integer as a buffer, where the low 32 bits are the movemask result
 * and the high 32 bits are "don't care" values. This function is not expected
 * to return a match offset of 32 or higher.
*/
static really_inline
const u8 *match64(const u8 *buf, const u64a z) {
if (unlikely(z != 0)) {
u32 pos = ctz64(z);
assert(pos < 32);
return buf + pos;
}
return NULL;
}
#endif /* MULTIACCEL_COMMON_H_ */
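
For reference, the removed SHIFT ladders compose shifts totalling n-1 so that bit i survives only if bits i..i+n-1 were all set. A standalone recomputation of the SHIFT4 case:

#include <cassert>
#include <cstdint>

// Equivalent of the SHIFT4 ladder: x &= x >> 1; x &= x >> 2; a total shift
// of n-1 = 3, so bit i survives iff bits i..i+3 were all set.
static uint32_t shift4(uint32_t x) {
    x &= x >> 1; // bit i now means bits i and i+1 were set
    x &= x >> 2; // bit i now means bits i..i+3 were set
    return x;
}

int main() {
    // 0b01111010: one run of four consecutive ones starting at bit 3.
    assert(shift4(0x7a) == 1u << 3);
    // 0b0111: a run of three only, so no bit survives.
    assert(shift4(0x07) == 0);
    return 0;
}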

View File

@ -1,439 +0,0 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "multiaccel_compilehelper.h"
using namespace std;
using namespace ue2;
#ifdef DEBUG
static const char* state_to_str[] = {
"FIRST_RUN",
"SECOND_RUN",
"WAITING_FOR_GRAB",
"FIRST_TAIL",
"SECOND_TAIL",
"STOPPED",
"INVALID"
};
static const char* type_to_str[] = {
"SHIFT",
"SHIFTGRAB",
"DOUBLESHIFT",
"DOUBLESHIFTGRAB",
"LONG",
"LONGGRAB",
"NONE"
};
static
void dumpMultiaccelState(const accel_data &d) {
DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
type_to_str[(unsigned) d.type],
state_to_str[(unsigned) d.state],
d.len1, d.tlen1, d.len2, d.tlen2);
}
#endif
/* stop all the matching. this may render most schemes invalid. */
static
void stop(accel_data &d) {
switch (d.state) {
case STATE_STOPPED:
case STATE_INVALID:
break;
case STATE_FIRST_TAIL:
case STATE_SECOND_RUN:
/*
* Shift matchers are a special case, because they have "tails".
* When a shift matcher reaches a mid/endpoint, tail mode is
* activated, which looks for more matches to extend the match.
*
* For example, consider the pattern /a{5}ba{3}/. Under normal
* circumstances, the long-grab matcher will be picked for this pattern
* (matching a run of a's followed by a not-a), because the doubleshift
* matcher would be confused by consecutive a's: it would parse the
* pattern as a.{0}a.{0}a (two shifts by 1) and throw out the rest of
* the pattern.
*
* With tails, we defer ending the run until we actually run out of
* matching characters, so the above pattern will now be parsed by the
* doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4).
*
* So if we are stopping shift matchers, we should check whether we are
* in the process of matching the first tail or the second run. If we
* are, we can't finish the second run as we are stopping, but we can
* try to split the first tail instead to obtain a valid second run.
*/
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) {
// can't split an empty void...
d.state = STATE_INVALID;
break;
}
d.len2 = 0;
d.state = STATE_STOPPED;
break;
case STATE_SECOND_TAIL:
d.state = STATE_STOPPED;
break;
case STATE_WAITING_FOR_GRAB:
case STATE_FIRST_RUN:
if (d.type == MultibyteAccelInfo::MAT_LONG) {
d.state = STATE_STOPPED;
} else {
d.state = STATE_INVALID;
}
break;
}
}
static
void validate(accel_data &d, unsigned max_len) {
// try and fit in all our tails
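// worked example (assuming max_len = 16): a doubleshift candidate with
// len1 = 5, tlen1 = 4, len2 = 6, tlen2 = 8 fails case 1 (5+4+6+8 = 23 >= 16)
// but hits case 2 (5+4+6 = 15 < 16): the first tail folds into len1 (= 9),
// the partial second tail 16 - 1 - 9 - 6 = 0 adds nothing, and we are left
// with a 9 + 6 = 15 byte scheme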
if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) {
// case 1: everything fits in
d.len1 += d.tlen1;
d.len2 += d.tlen2;
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) {
// case 2: everything but the second tail fits in
d.len1 += d.tlen1;
d.tlen1 = 0;
// try going for a partial tail
if (d.tlen2 != 0) {
int new_tlen2 = max_len - 1 - d.len1 - d.len2;
if (new_tlen2 > 0) {
d.len2 += new_tlen2;
}
d.tlen2 = 0;
}
} else if (d.len1 + d.tlen1 < max_len) {
// case 3: first run and its tail fits in
if (d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
// split the tail into a second run
d.len2 = d.tlen1;
} else {
d.len1 += d.tlen1;
d.len2 = 0;
}
d.tlen1 = 0;
d.tlen2 = 0;
} else if (d.len1 < max_len) {
// case 4: nothing but the first run fits in
// try going for a partial tail
if (d.tlen1 != 0) {
int new_tlen1 = max_len - 1 - d.len1;
if (new_tlen1 > 0) {
d.len1 += new_tlen1;
}
d.tlen1 = 0;
}
d.len2 = 0;
d.tlen2 = 0;
}
// if we removed our second run, doubleshift matchers are no longer valid
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) {
d.state = STATE_INVALID;
} else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) {
// long matchers can just stop whenever they want to
d.len1 = max_len - 1;
}
// now, general sanity checks
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) {
d.state = STATE_INVALID;
}
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) {
d.state = STATE_INVALID;
}
}
static
void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
switch (d.type) {
case MultibyteAccelInfo::MAT_LONG:
{
/*
* For the long matcher, we want lots of consecutive same-or-subset
* char-reaches.
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else {
d.state = STATE_STOPPED;
}
}
break;
case MultibyteAccelInfo::MAT_LONGGRAB:
{
/*
* For the long-grab matcher, we want lots of consecutive same-or-subset
* char-reaches with a negative match at the end.
*/
if ((ref_cr & cur_cr) == cur_cr) {
d.len1++;
} else if (!(ref_cr & cur_cr).any()) {
/* we grabbed, stop immediately */
d.state = STATE_STOPPED;
} else {
/* our run-n-grab was interrupted; mark as invalid */
d.state = STATE_INVALID;
}
}
break;
case MultibyteAccelInfo::MAT_SHIFTGRAB:
{
/*
* For the shift-grab matcher, we want two matches separated by anything;
* however, the second vertex *must* be a negative (non-overlapping) match.
*
* The shiftgrab matcher is identical to shift except for the presence of
* the grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_SHIFT:
{
/*
* For the shift matcher, we want two matches separated by anything.
*/
if (ref_cr == cur_cr) {
// keep matching tail
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
// simply advance
d.len1++;
break;
case STATE_FIRST_TAIL:
// we found a non-matching char after tail, so stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
{
/*
* For the double shift-grab matcher, we want two matches separated by
* either negative matches or dots; however, the second vertex *must*
* be a negative match.
*
* The doubleshiftgrab matcher is identical to doubleshift except for
* the presence of the grab.
*/
if (d.state == STATE_WAITING_FOR_GRAB) {
if ((ref_cr & cur_cr).any()) {
d.state = STATE_INVALID;
} else {
d.state = STATE_FIRST_RUN;
d.len1++;
}
return;
}
}
/* no break, falling through */
case MultibyteAccelInfo::MAT_DSHIFT:
{
/*
* For the double shift matcher, we want three matches, each separated
* by a (possibly long) run of anything.
*
* The doubleshift matcher is complicated by the presence of tails.
*/
if (ref_cr == cur_cr) {
// decide if we are activating second shift or matching tails
switch (d.state) {
case STATE_FIRST_RUN:
d.state = STATE_FIRST_TAIL;
d.len2 = 1; // we're now ready for our second run
break;
case STATE_FIRST_TAIL:
d.tlen1++;
break;
case STATE_SECOND_RUN:
d.state = STATE_SECOND_TAIL;
break;
case STATE_SECOND_TAIL:
d.tlen2++;
break;
default:
// shouldn't happen
assert(0);
}
} else {
switch (d.state) {
case STATE_FIRST_RUN:
d.len1++;
break;
case STATE_FIRST_TAIL:
// start second run
d.state = STATE_SECOND_RUN;
d.len2++;
break;
case STATE_SECOND_RUN:
d.len2++;
break;
case STATE_SECOND_TAIL:
// stop
d.state = STATE_STOPPED;
break;
default:
// shouldn't happen
assert(0);
}
}
}
break;
default:
// shouldn't happen
assert(0);
break;
}
}
MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr,
u32 off, unsigned max_length)
: cr(ref_cr), offset(off), max_len(max_length) {
int accel_num = (int) MultibyteAccelInfo::MAT_MAX;
accels.resize(accel_num);
// mark everything as valid
for (int i = 0; i < accel_num; i++) {
accel_data &ad = accels[i];
ad.len1 = 1;
ad.type = (MultibyteAccelInfo::multiaccel_type) i;
/* for shift-grab matchers, we are waiting for the grab right at the start */
if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB
|| ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
ad.state = STATE_WAITING_FOR_GRAB;
} else {
ad.state = STATE_FIRST_RUN;
}
}
}
bool MultiaccelCompileHelper::canAdvance() {
for (const accel_data &ad : accels) {
if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) {
return true;
}
}
return false;
}
void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
for (accel_data &ad : accels) {
if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) {
continue;
}
match(ad, cr, cur_cr);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
}
}
MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
int best_len = 0;
accel_data best;
DEBUG_PRINTF("Stopping multiaccel compile\n");
for (accel_data &ad : accels) {
// stop our matching
stop(ad);
validate(ad, max_len);
#ifdef DEBUG
dumpMultiaccelState(ad);
#endif
// skip invalid schemes
if (ad.state == STATE_INVALID) {
continue;
}
DEBUG_PRINTF("Marking as viable\n");
// TODO: weigh the relative strengths of accel schemes? e.g. a shorter
// long match might in some cases be preferable to a longer double
// shift match.
int as_len = ad.len1 + ad.len2;
if (as_len >= best_len) {
DEBUG_PRINTF("Marking as best\n");
best_len = as_len;
best = ad;
}
}
// if we found at least one accel scheme, return it
if (best.state != STATE_INVALID) {
#ifdef DEBUG
DEBUG_PRINTF("Picked best multiaccel state:\n");
dumpMultiaccelState(best);
#endif
MultibyteAccelInfo info;
info.cr = cr;
info.offset = offset;
info.len1 = best.len1;
info.len2 = best.len2;
info.type = best.type;
return info;
}
return MultibyteAccelInfo();
}
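For context, a hedged sketch of how a caller might drive this helper over a
chain of char-reaches; `pickScheme()` and its surroundings are hypothetical
stand-ins, not taken from the Hyperscan sources, and assume only the
constructor, canAdvance(), advance() and getBestScheme() shown above.

#include <cassert>
#include <vector>
#include "multiaccel_compilehelper.h"

using namespace ue2;

static MultibyteAccelInfo pickScheme(const std::vector<CharReach> &crs,
                                     u32 offset, unsigned max_len) {
    assert(!crs.empty());
    // Seed with the reference char-reach; every candidate scheme starts in
    // FIRST_RUN (or WAITING_FOR_GRAB for the grab variants).
    MultiaccelCompileHelper mac(crs[0], offset, max_len);
    for (size_t i = 1; i < crs.size() && mac.canAdvance(); i++) {
        mac.advance(crs[i]);  // runs match() for every still-live scheme
    }
    // stop()s and validate()s every candidate, then returns the longest
    // viable scheme (a default MultibyteAccelInfo if none survived).
    return mac.getBestScheme();
}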

View File

@ -1,149 +0,0 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFT_H_
#define MULTIACCEL_DOUBLESHIFT_H_
#include "multiaccel_common.h"
#define DOUBLESHIFT_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t tmp = z; \
z |= (((match_t) 1 << (len)) - 1) << (match_sz / 2); \
tmp |= (((match_t) 1 << (len + len2)) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH(n, u32, 32)
#define DOUBLESHIFT_MATCH_64_DEF(n) \
DOUBLESHIFT_MATCH(n, u64a, 64)
#define DOUBLESHIFT_MATCH_DEF(n) \
DOUBLESHIFT_MATCH_32_DEF(n) \
DOUBLESHIFT_MATCH_64_DEF(n)
DOUBLESHIFT_MATCH_DEF(1)
DOUBLESHIFT_MATCH_DEF(2)
DOUBLESHIFT_MATCH_DEF(3)
DOUBLESHIFT_MATCH_DEF(4)
DOUBLESHIFT_MATCH_DEF(5)
DOUBLESHIFT_MATCH_DEF(6)
DOUBLESHIFT_MATCH_DEF(7)
DOUBLESHIFT_MATCH_DEF(8)
DOUBLESHIFT_MATCH_DEF(9)
DOUBLESHIFT_MATCH_DEF(10)
DOUBLESHIFT_MATCH_DEF(11)
DOUBLESHIFT_MATCH_DEF(12)
DOUBLESHIFT_MATCH_DEF(13)
DOUBLESHIFT_MATCH_DEF(14)
DOUBLESHIFT_MATCH_DEF(15)
DOUBLESHIFT_MATCH_64_DEF(16)
DOUBLESHIFT_MATCH_64_DEF(17)
DOUBLESHIFT_MATCH_64_DEF(18)
DOUBLESHIFT_MATCH_64_DEF(19)
DOUBLESHIFT_MATCH_64_DEF(20)
DOUBLESHIFT_MATCH_64_DEF(21)
DOUBLESHIFT_MATCH_64_DEF(22)
DOUBLESHIFT_MATCH_64_DEF(23)
DOUBLESHIFT_MATCH_64_DEF(24)
DOUBLESHIFT_MATCH_64_DEF(25)
DOUBLESHIFT_MATCH_64_DEF(26)
DOUBLESHIFT_MATCH_64_DEF(27)
DOUBLESHIFT_MATCH_64_DEF(28)
DOUBLESHIFT_MATCH_64_DEF(29)
DOUBLESHIFT_MATCH_64_DEF(30)
DOUBLESHIFT_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshift_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_32_1,
&doubleshiftMatch_32_2,
&doubleshiftMatch_32_3,
&doubleshiftMatch_32_4,
&doubleshiftMatch_32_5,
&doubleshiftMatch_32_6,
&doubleshiftMatch_32_7,
&doubleshiftMatch_32_8,
&doubleshiftMatch_32_9,
&doubleshiftMatch_32_10,
&doubleshiftMatch_32_11,
&doubleshiftMatch_32_12,
&doubleshiftMatch_32_13,
&doubleshiftMatch_32_14,
&doubleshiftMatch_32_15,
};
static
const UNUSED u8 * (*doubleshift_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftMatch_64_1,
&doubleshiftMatch_64_2,
&doubleshiftMatch_64_3,
&doubleshiftMatch_64_4,
&doubleshiftMatch_64_5,
&doubleshiftMatch_64_6,
&doubleshiftMatch_64_7,
&doubleshiftMatch_64_8,
&doubleshiftMatch_64_9,
&doubleshiftMatch_64_10,
&doubleshiftMatch_64_11,
&doubleshiftMatch_64_12,
&doubleshiftMatch_64_13,
&doubleshiftMatch_64_14,
&doubleshiftMatch_64_15,
&doubleshiftMatch_64_16,
&doubleshiftMatch_64_17,
&doubleshiftMatch_64_18,
&doubleshiftMatch_64_19,
&doubleshiftMatch_64_20,
&doubleshiftMatch_64_21,
&doubleshiftMatch_64_22,
&doubleshiftMatch_64_23,
&doubleshiftMatch_64_24,
&doubleshiftMatch_64_25,
&doubleshiftMatch_64_26,
&doubleshiftMatch_64_27,
&doubleshiftMatch_64_28,
&doubleshiftMatch_64_29,
&doubleshiftMatch_64_30,
&doubleshiftMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFT_H_ */
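Ignoring the boundary-fill terms, and assuming VARISHIFT(src, dst, n) in
multiaccel_common.h performs `dst &= src >> n`, the shift sequence in
DOUBLESHIFT_MATCH reduces algebraically to
`z & (z >> len) & (z >> (len + len2))`: a class hit at offset i, another at
i + len, and a third at i + len + len2. A hedged, standalone C++ sketch (not
the Hyperscan build), using the /a{5}ba{3}/ example from the compile helper,
which the doubleshift matcher views as a.{3}a.{3}a:

#include <cstdint>
#include <cstdio>

int main() {
    // a.{3}a.{3}a: 'a' required at offsets 0, 4 and 8.
    const char *buf = "aaaaabaaa";
    uint64_t z = 0;
    for (unsigned i = 0; buf[i]; i++) {
        if (buf[i] == 'a') {
            z |= UINT64_C(1) << i;  // bit i = class hit at offset i
        }
    }
    const unsigned len = 4, len2 = 4;
    uint64_t hits = z & (z >> len) & (z >> (len + len2));
    // __builtin_ctzll is a GCC/Clang builtin, standing in for ctz64().
    printf("doubleshift anchor at offset %u\n",
           hits ? (unsigned)__builtin_ctzll(hits) : ~0u);  // prints 0
    return 0;
}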

View File

@ -1,152 +0,0 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_DOUBLESHIFTGRAB_H_
#define MULTIACCEL_DOUBLESHIFTGRAB_H_
#include "multiaccel_common.h"
#define DOUBLESHIFTGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(doubleshiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
if (unlikely(z)) { \
match_t neg = ~z; \
match_t tmp = z; \
z |= (((match_t) 1 << (len)) - 1) << (match_sz / 2); \
tmp |= (((match_t) 1 << (len + len2)) - 1) << (match_sz / 2); \
neg |= (((match_t) 1 << len) - 1) << (match_sz / 2); \
VARISHIFT(z, z, len); \
VARISHIFT(tmp, tmp, len2); \
VARISHIFT(neg, z, 1); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u32, 32)
#define DOUBLESHIFTGRAB_MATCH_64_DEF(n) \
DOUBLESHIFTGRAB_MATCH(n, u64a, 64)
#define DOUBLESHIFTGRAB_MATCH_DEF(n) \
DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
DOUBLESHIFTGRAB_MATCH_64_DEF(n)
DOUBLESHIFTGRAB_MATCH_DEF(1)
DOUBLESHIFTGRAB_MATCH_DEF(2)
DOUBLESHIFTGRAB_MATCH_DEF(3)
DOUBLESHIFTGRAB_MATCH_DEF(4)
DOUBLESHIFTGRAB_MATCH_DEF(5)
DOUBLESHIFTGRAB_MATCH_DEF(6)
DOUBLESHIFTGRAB_MATCH_DEF(7)
DOUBLESHIFTGRAB_MATCH_DEF(8)
DOUBLESHIFTGRAB_MATCH_DEF(9)
DOUBLESHIFTGRAB_MATCH_DEF(10)
DOUBLESHIFTGRAB_MATCH_DEF(11)
DOUBLESHIFTGRAB_MATCH_DEF(12)
DOUBLESHIFTGRAB_MATCH_DEF(13)
DOUBLESHIFTGRAB_MATCH_DEF(14)
DOUBLESHIFTGRAB_MATCH_DEF(15)
DOUBLESHIFTGRAB_MATCH_64_DEF(16)
DOUBLESHIFTGRAB_MATCH_64_DEF(17)
DOUBLESHIFTGRAB_MATCH_64_DEF(18)
DOUBLESHIFTGRAB_MATCH_64_DEF(19)
DOUBLESHIFTGRAB_MATCH_64_DEF(20)
DOUBLESHIFTGRAB_MATCH_64_DEF(21)
DOUBLESHIFTGRAB_MATCH_64_DEF(22)
DOUBLESHIFTGRAB_MATCH_64_DEF(23)
DOUBLESHIFTGRAB_MATCH_64_DEF(24)
DOUBLESHIFTGRAB_MATCH_64_DEF(25)
DOUBLESHIFTGRAB_MATCH_64_DEF(26)
DOUBLESHIFTGRAB_MATCH_64_DEF(27)
DOUBLESHIFTGRAB_MATCH_64_DEF(28)
DOUBLESHIFTGRAB_MATCH_64_DEF(29)
DOUBLESHIFTGRAB_MATCH_64_DEF(30)
DOUBLESHIFTGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_32_1,
&doubleshiftgrabMatch_32_2,
&doubleshiftgrabMatch_32_3,
&doubleshiftgrabMatch_32_4,
&doubleshiftgrabMatch_32_5,
&doubleshiftgrabMatch_32_6,
&doubleshiftgrabMatch_32_7,
&doubleshiftgrabMatch_32_8,
&doubleshiftgrabMatch_32_9,
&doubleshiftgrabMatch_32_10,
&doubleshiftgrabMatch_32_11,
&doubleshiftgrabMatch_32_12,
&doubleshiftgrabMatch_32_13,
&doubleshiftgrabMatch_32_14,
&doubleshiftgrabMatch_32_15,
};
static
const UNUSED u8 * (*doubleshiftgrab_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
{
// skip the first
0,
&doubleshiftgrabMatch_64_1,
&doubleshiftgrabMatch_64_2,
&doubleshiftgrabMatch_64_3,
&doubleshiftgrabMatch_64_4,
&doubleshiftgrabMatch_64_5,
&doubleshiftgrabMatch_64_6,
&doubleshiftgrabMatch_64_7,
&doubleshiftgrabMatch_64_8,
&doubleshiftgrabMatch_64_9,
&doubleshiftgrabMatch_64_10,
&doubleshiftgrabMatch_64_11,
&doubleshiftgrabMatch_64_12,
&doubleshiftgrabMatch_64_13,
&doubleshiftgrabMatch_64_14,
&doubleshiftgrabMatch_64_15,
&doubleshiftgrabMatch_64_16,
&doubleshiftgrabMatch_64_17,
&doubleshiftgrabMatch_64_18,
&doubleshiftgrabMatch_64_19,
&doubleshiftgrabMatch_64_20,
&doubleshiftgrabMatch_64_21,
&doubleshiftgrabMatch_64_22,
&doubleshiftgrabMatch_64_23,
&doubleshiftgrabMatch_64_24,
&doubleshiftgrabMatch_64_25,
&doubleshiftgrabMatch_64_26,
&doubleshiftgrabMatch_64_27,
&doubleshiftgrabMatch_64_28,
&doubleshiftgrabMatch_64_29,
&doubleshiftgrabMatch_64_30,
&doubleshiftgrabMatch_64_31,
};
#endif /* MULTIACCEL_DOUBLESHIFTGRAB_H_ */

View File

@ -1,145 +0,0 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONG_H_
#define MULTIACCEL_LONG_H_
#include "multiaccel_common.h"
#define LONG_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
z |= (((match_t) 1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONG_MATCH_32_DEF(n) \
LONG_MATCH(n, u32, 32)
#define LONG_MATCH_64_DEF(n) \
LONG_MATCH(n, u64a, 64)
#define LONG_MATCH_DEF(n) \
LONG_MATCH_32_DEF(n) \
LONG_MATCH_64_DEF(n)
LONG_MATCH_DEF(1)
LONG_MATCH_DEF(2)
LONG_MATCH_DEF(3)
LONG_MATCH_DEF(4)
LONG_MATCH_DEF(5)
LONG_MATCH_DEF(6)
LONG_MATCH_DEF(7)
LONG_MATCH_DEF(8)
LONG_MATCH_DEF(9)
LONG_MATCH_DEF(10)
LONG_MATCH_DEF(11)
LONG_MATCH_DEF(12)
LONG_MATCH_DEF(13)
LONG_MATCH_DEF(14)
LONG_MATCH_DEF(15)
LONG_MATCH_64_DEF(16)
LONG_MATCH_64_DEF(17)
LONG_MATCH_64_DEF(18)
LONG_MATCH_64_DEF(19)
LONG_MATCH_64_DEF(20)
LONG_MATCH_64_DEF(21)
LONG_MATCH_64_DEF(22)
LONG_MATCH_64_DEF(23)
LONG_MATCH_64_DEF(24)
LONG_MATCH_64_DEF(25)
LONG_MATCH_64_DEF(26)
LONG_MATCH_64_DEF(27)
LONG_MATCH_64_DEF(28)
LONG_MATCH_64_DEF(29)
LONG_MATCH_64_DEF(30)
LONG_MATCH_64_DEF(31)
static
const UNUSED u8 *(*long_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longMatch_32_1,
&longMatch_32_2,
&longMatch_32_3,
&longMatch_32_4,
&longMatch_32_5,
&longMatch_32_6,
&longMatch_32_7,
&longMatch_32_8,
&longMatch_32_9,
&longMatch_32_10,
&longMatch_32_11,
&longMatch_32_12,
&longMatch_32_13,
&longMatch_32_14,
&longMatch_32_15,
};
static
const UNUSED u8 *(*long_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longMatch_64_1,
&longMatch_64_2,
&longMatch_64_3,
&longMatch_64_4,
&longMatch_64_5,
&longMatch_64_6,
&longMatch_64_7,
&longMatch_64_8,
&longMatch_64_9,
&longMatch_64_10,
&longMatch_64_11,
&longMatch_64_12,
&longMatch_64_13,
&longMatch_64_14,
&longMatch_64_15,
&longMatch_64_16,
&longMatch_64_17,
&longMatch_64_18,
&longMatch_64_19,
&longMatch_64_20,
&longMatch_64_21,
&longMatch_64_22,
&longMatch_64_23,
&longMatch_64_24,
&longMatch_64_25,
&longMatch_64_26,
&longMatch_64_27,
&longMatch_64_28,
&longMatch_64_29,
&longMatch_64_30,
&longMatch_64_31,
};
#endif /* MULTIACCEL_LONG_H_ */
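One subtlety worth illustrating: LONG_MATCH pre-fills len - 1 set bits just
above the movemask half (from bit match_sz / 2 upwards), so a run that is cut
off by the end of the current block still reports a candidate match at the
boundary instead of being lost. A hedged, standalone C++ sketch, assuming
SHIFT4 unrolls to `z &= z >> 1; z &= z >> 2;` (not the Hyperscan build):

#include <cstdint>
#include <cstdio>

int main() {
    const unsigned len = 4;
    // Low 16 bits: movemask for a 16-byte block ending in two class hits.
    uint32_t z = 0xC000;                          // hits at offsets 14, 15
    z |= (((uint32_t)1 << (len - 1)) - 1) << 16;  // the boundary fill
    z &= z >> 1;                                  // SHIFT4: shifts sum to len - 1
    z &= z >> 2;
    // __builtin_ctz is a GCC/Clang builtin, standing in for ctz32().
    printf("possible run at offset %u\n",
           z ? (unsigned)__builtin_ctz(z) : ~0u); // prints 14
    return 0;
}

Without the fill, the two trailing hits would be discarded; with it, offset 14
survives as a possible match whose continuation can be checked in the next
block.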

View File

@ -1,148 +0,0 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MULTIACCEL_LONGGRAB_H_
#define MULTIACCEL_LONGGRAB_H_
#include "multiaccel_common.h"
#define LONGGRAB_MATCH(len, match_t, match_sz) \
static really_inline \
const u8 * JOIN4(longgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
if (unlikely(z)) { \
match_t tmp = ~z; \
tmp |= (((match_t) 1 << len) - 1) << (match_sz / 2); \
z |= (((match_t) 1 << (len - 1)) - 1) << (match_sz / 2); \
JOIN(SHIFT, len)(z); \
VARISHIFT(tmp, z, len); \
return JOIN(match, match_sz)(buf, z); \
} \
return NULL; \
}
#define LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH(n, u32, 32)
#define LONGGRAB_MATCH_64_DEF(n) \
LONGGRAB_MATCH(n, u64a, 64)
#define LONGGRAB_MATCH_DEF(n) \
LONGGRAB_MATCH_32_DEF(n) \
LONGGRAB_MATCH_64_DEF(n)
LONGGRAB_MATCH_DEF(1)
LONGGRAB_MATCH_DEF(2)
LONGGRAB_MATCH_DEF(3)
LONGGRAB_MATCH_DEF(4)
LONGGRAB_MATCH_DEF(5)
LONGGRAB_MATCH_DEF(6)
LONGGRAB_MATCH_DEF(7)
LONGGRAB_MATCH_DEF(8)
LONGGRAB_MATCH_DEF(9)
LONGGRAB_MATCH_DEF(10)
LONGGRAB_MATCH_DEF(11)
LONGGRAB_MATCH_DEF(12)
LONGGRAB_MATCH_DEF(13)
LONGGRAB_MATCH_DEF(14)
LONGGRAB_MATCH_DEF(15)
LONGGRAB_MATCH_64_DEF(16)
LONGGRAB_MATCH_64_DEF(17)
LONGGRAB_MATCH_64_DEF(18)
LONGGRAB_MATCH_64_DEF(19)
LONGGRAB_MATCH_64_DEF(20)
LONGGRAB_MATCH_64_DEF(21)
LONGGRAB_MATCH_64_DEF(22)
LONGGRAB_MATCH_64_DEF(23)
LONGGRAB_MATCH_64_DEF(24)
LONGGRAB_MATCH_64_DEF(25)
LONGGRAB_MATCH_64_DEF(26)
LONGGRAB_MATCH_64_DEF(27)
LONGGRAB_MATCH_64_DEF(28)
LONGGRAB_MATCH_64_DEF(29)
LONGGRAB_MATCH_64_DEF(30)
LONGGRAB_MATCH_64_DEF(31)
static
const UNUSED u8 *(*longgrab_match_funcs_32[])(const u8 *buf, u32 z) =
{
// skip the first
0,
&longgrabMatch_32_1,
&longgrabMatch_32_2,
&longgrabMatch_32_3,
&longgrabMatch_32_4,
&longgrabMatch_32_5,
&longgrabMatch_32_6,
&longgrabMatch_32_7,
&longgrabMatch_32_8,
&longgrabMatch_32_9,
&longgrabMatch_32_10,
&longgrabMatch_32_11,
&longgrabMatch_32_12,
&longgrabMatch_32_13,
&longgrabMatch_32_14,
&longgrabMatch_32_15,
};
static
const UNUSED u8 *(*longgrab_match_funcs_64[])(const u8 *buf, u64a z) =
{
// skip the first
0,
&longgrabMatch_64_1,
&longgrabMatch_64_2,
&longgrabMatch_64_3,
&longgrabMatch_64_4,
&longgrabMatch_64_5,
&longgrabMatch_64_6,
&longgrabMatch_64_7,
&longgrabMatch_64_8,
&longgrabMatch_64_9,
&longgrabMatch_64_10,
&longgrabMatch_64_11,
&longgrabMatch_64_12,
&longgrabMatch_64_13,
&longgrabMatch_64_14,
&longgrabMatch_64_15,
&longgrabMatch_64_16,
&longgrabMatch_64_17,
&longgrabMatch_64_18,
&longgrabMatch_64_19,
&longgrabMatch_64_20,
&longgrabMatch_64_21,
&longgrabMatch_64_22,
&longgrabMatch_64_23,
&longgrabMatch_64_24,
&longgrabMatch_64_25,
&longgrabMatch_64_26,
&longgrabMatch_64_27,
&longgrabMatch_64_28,
&longgrabMatch_64_29,
&longgrabMatch_64_30,
&longgrabMatch_64_31,
};
#endif /* MULTIACCEL_LONGGRAB_H_ */
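The grab variant adds one constraint on top of the long matcher: after the
run of len class hits, the next byte must *not* hit, which is what the
`tmp = ~z` term enforces (equivalent to `run & (~z >> len)` after the same
cascade). A hedged, standalone C++ sketch (not the Hyperscan build):

#include <cstdint>
#include <cstdio>

int main() {
    // Pattern shape a{4}[^a]: four 'a' hits, then a grab (non-hit).
    const char *buf = "aaaaaabxx";
    uint64_t z = 0;
    for (unsigned i = 0; buf[i]; i++) {
        if (buf[i] == 'a') {
            z |= UINT64_C(1) << i;    // bit i = class hit at offset i
        }
    }
    uint64_t run = z & (z >> 1);      // runs of 2
    run &= run >> 2;                  // runs of 4 (shifts sum to len - 1)
    uint64_t hits = run & (~z >> 4);  // grab: non-hit at offset len past the start
    // __builtin_ctzll is a GCC/Clang builtin, standing in for ctz64().
    printf("longgrab match at offset %u\n",
           hits ? (unsigned)__builtin_ctzll(hits) : ~0u);  // prints 2
    return 0;
}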

Some files were not shown because too many files have changed in this diff.