diff --git a/CHANGELOG.md b/CHANGELOG.md index 729e58ef..03eba64d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,40 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. -## [4.3.2] 2016-11-15 +## [4.4.0] 2017-01-20 +- Introduce the "fat runtime" build. This will build several variants of the + Hyperscan scanning engine specialised for different processor feature sets, + and use the appropriate one for the host at runtime. This uses the "ifunc" + indirect function attribute provided by GCC and is currently available on + Linux only, where it is the default for release builds. +- New API function: add the `hs_valid_platform()` function. This function tests + whether the host provides the SSSE3 instruction set required by Hyperscan. +- Introduce a new standard benchmarking tool, "hsbench". This provides an easy + way to measure Hyperscan's performance for a particular set of patterns and + corpus of data to be scanned. +- Introduce a 64-bit GPR LimEx NFA model, which uses 64-bit GPRs on 64-bit + hosts and SSE registers on 32-bit hosts. +- Introduce a new DFA model ("McSheng") which is a hybrid of the existing + McClellan and Sheng models. This improves scanning performance for some + cases. +- Introduce lookaround specialisations to improve scanning performance. +- Improve the handling of long literals by moving confirmation to the Rose + interpreter and simplifying the hash table used to track them in streaming + mode. +- Improve compile time optimisation for removing redundant paths from + expression graphs. +- Build: improve support for building with MSVC toolchain. +- Reduce the size of small write DFAs used for small scans in block mode. +- Introduce a custom graph type (`ue2_graph`) used in place of the Boost Graph + Library's `adjacency_list` type. Improves compile time performance and type + safety. +- Improve scanning performance of the McClellan DFA. +- Bugfix for a very unusual SOM case where the incorrect start offset was + reported for a match. +- Bugfix for issue #37, removing execute permissions from some source files. +- Bugfix for issue #41, handle Windows line endings in pattern files. +## [4.3.2] 2016-11-15 - Bugfix for issue #39. This small change is a workaround for an issue in Boost 1.62. The fix has been submitted to Boost for inclusion in a future release. @@ -11,7 +43,7 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. ## [4.3.1] 2016-08-29 - Bugfix for issue #30. In recent versions of Clang, a write to a variable was being elided, resulting in corrupted stream state after calling - hs_reset_stream(). + `hs_reset_stream()`. 
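The `hs_valid_platform()` function added in 4.4.0 above can serve as a guard before any compile or scan work. A minimal sketch (illustrative only; assumes the public header is installed as `hs/hs.h`):

```c
#include <stdio.h>
#include <hs/hs.h>

int main(void) {
    /* hs_valid_platform() returns HS_SUCCESS when the host CPU provides the
     * SSSE3 instructions Hyperscan requires, and HS_ARCH_ERROR otherwise. */
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "this host does not support Hyperscan (SSSE3 required)\n");
        return 1;
    }
    printf("host supported; safe to compile and scan\n");
    return 0;
}
```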
## [4.3.0] 2016-08-24 - Introduce a new analysis pass ("Violet") used for decomposition of patterns diff --git a/CMakeLists.txt b/CMakeLists.txt index 842834a1..3a7d40ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,19 +1,11 @@ cmake_minimum_required (VERSION 2.8.11) - -# don't use the built-in default configs -set (CMAKE_NOT_USING_CONFIG_FLAGS TRUE) - project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 3) -set (HS_PATCH_VERSION 2) +set (HS_MINOR_VERSION 4) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) -# since we are doing this manually, we only have three types -set (CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo" - CACHE STRING "" FORCE) - set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) @@ -70,7 +62,14 @@ include_directories(SYSTEM include) set(BOOST_USE_STATIC_LIBS OFF) set(BOOST_USE_MULTITHREADED OFF) set(BOOST_USE_STATIC_RUNTIME OFF) -set(BOOST_MINVERSION 1.57.0) +if (CMAKE_SYSTEM_NAME MATCHES "Darwin" + OR (CMAKE_SYSTEM_NAME MATCHES "FreeBSD" + AND CMAKE_C_COMPILER_ID MATCHES "Clang")) + # we need a more recent boost for libc++ used by clang on OSX and FreeBSD + set(BOOST_MINVERSION 1.61.0) +else () + set(BOOST_MINVERSION 1.57.0) +endif () set(BOOST_NO_BOOST_CMAKE ON) # first check for Boost installed on the system @@ -85,6 +84,7 @@ if(NOT Boost_FOUND) endif() endif() +include (${CMAKE_MODULE_PATH}/boost.cmake) # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) find_package(PythonInterp) @@ -151,27 +151,21 @@ if(MSVC OR MSVC_IDE) if (MSVC_VERSION LESS 1700) message(FATAL_ERROR "The project requires C++11 features.") else() - # set base flags - set(CMAKE_C_FLAGS "/DWIN32 /D_WINDOWS /W3") - set(CMAKE_C_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") - set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") - - set(CMAKE_CXX_FLAGS "/DWIN32 /D_WINDOWS /W3 /GR /EHsc") - set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") - set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") - if (WINDOWS_ICC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") else() #TODO: don't hardcode arch - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /wd4267") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /arch:AVX /wd4267") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /arch:AVX /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") endif() + string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") - + if (DISABLE_ASSERTS) + set(CMAKE_C_FLAGS_DEBUG "/DNDEBUG ${CMAKE_C_FLAGS_DEBUG}") + set(CMAKE_CXX_FLAGS_DEBUG "/DNDEBUG ${CMAKE_CXX_FLAGS_DEBUG}") + endif () endif() else() @@ -192,6 +186,12 @@ else() 
unset(_GXX_OUTPUT) endif() + # remove CMake's idea of optimisation + foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") + endforeach () + if(OPTIMISE) set(OPT_C_FLAG "-O3") set(OPT_CXX_FLAG "-O2") @@ -200,32 +200,28 @@ else() set(OPT_CXX_FLAG "-O0") endif(OPTIMISE) - # set up base flags for build types - set(CMAKE_C_FLAGS_DEBUG "-g ${OPT_C_FLAG} -Werror") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "-g ${OPT_C_FLAG}") - set(CMAKE_C_FLAGS_RELEASE "${OPT_C_FLAG}") + # set compiler flags - more are tested and added later + set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") + set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") - set(CMAKE_CXX_FLAGS_DEBUG "-g ${OPT_CXX_FLAG} -Werror") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g ${OPT_CXX_FLAG}") - set(CMAKE_CXX_FLAGS_RELEASE "${OPT_CXX_FLAG}") + if (NOT RELEASE_BUILD) + # -Werror is most useful during development, don't potentially break + # release builds + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") + endif() if (DISABLE_ASSERTS) - # usually true for release builds, false for debug - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DNDEBUG") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG") + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") endif() - - # set compiler flags - more are tested and added later - set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") - set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") - if (NOT CMAKE_C_FLAGS MATCHES .*march.*) - message(STATUS "Building for current host CPU") - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -march=native -mtune=native") + set(ARCH_C_FLAGS "${ARCH_C_FLAGS} -march=native -mtune=native") endif() + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.*) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -march=native -mtune=native") + set(ARCH_CXX_FLAGS "${ARCH_CXX_FLAGS} -march=native -mtune=native") endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -242,12 +238,17 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() + if (RELEASE_BUILD) + # we don't need the noise of ABI warnings in a release build + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") + endif () + endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(tmmintrin.h HAVE_TMMINTRIN_H) CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) @@ -267,9 +268,36 @@ if (RELEASE_BUILD) endif() endif() -# ensure we are building for the right target arch +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + # This is a Linux-only feature for now - requires platform support + # elsewhere + message(STATUS "generator is ${CMAKE_GENERATOR}") + if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND + CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") + message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) + elseif (NOT 
(CMAKE_GENERATOR MATCHES "Unix Makefiles" OR + (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) + message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") + set (FAT_RUNTIME_REQUISITES FALSE) + else() + include (${CMAKE_MODULE_PATH}/attrib.cmake) + if (NOT HAS_C_ATTR_IFUNC) + message(STATUS "Compiler does not support ifunc attribute, cannot build fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) + else () + set (FAT_RUNTIME_REQUISITES TRUE) + endif() + endif() + CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) +endif () + include (${CMAKE_MODULE_PATH}/arch.cmake) +if (NOT FAT_RUNTIME AND NOT HAVE_SSSE3) + message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") +endif () + # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) @@ -375,6 +403,16 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") endif() endif() +if (NOT FAT_RUNTIME) +message(STATUS "Building for current host CPU") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") +else() +message(STATUS "Building runtime for multiple microarchitectures") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + add_subdirectory(util) add_subdirectory(unit) add_subdirectory(doc/dev-reference) @@ -401,8 +439,13 @@ if (NOT WIN32) endif() # only set these after all tests are done +if (NOT FAT_RUNTIME) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") +else() +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") +endif() if(NOT WIN32) @@ -424,12 +467,21 @@ SET(hs_HEADERS ) install(FILES ${hs_HEADERS} DESTINATION include/hs) +set (hs_exec_common_SRCS + src/alloc.c + src/scratch.c + src/util/cpuid_flags.c + src/util/cpuid_flags.h + src/util/multibit.c + ) + set (hs_exec_SRCS ${hs_HEADERS} src/hs_version.h src/ue2common.h - src/alloc.c src/allocator.h + src/crc32.c + src/crc32.h src/report.h src/runtime.c src/fdr/fdr.c @@ -437,7 +489,6 @@ set (hs_exec_SRCS src/fdr/fdr_internal.h src/fdr/fdr_confirm.h src/fdr/fdr_confirm_runtime.h - src/fdr/fdr_streaming_runtime.h src/fdr/flood_runtime.h src/fdr/fdr_loadval.h src/fdr/teddy.c @@ -461,15 +512,12 @@ set (hs_exec_SRCS src/nfa/lbr.h src/nfa/lbr_common_impl.h src/nfa/lbr_internal.h - src/nfa/mcclellan.c - src/nfa/mcclellan.h - src/nfa/mcclellan_common_impl.h - src/nfa/mcclellan_internal.h src/nfa/limex_accel.c src/nfa/limex_accel.h src/nfa/limex_exceptional.h src/nfa/limex_native.c src/nfa/limex_ring.h + src/nfa/limex_64.c src/nfa/limex_simd128.c src/nfa/limex_simd256.c src/nfa/limex_simd384.c @@ -482,6 +530,14 @@ set (hs_exec_SRCS src/nfa/limex_runtime_impl.h src/nfa/limex_shuffle.h src/nfa/limex_state_impl.h + src/nfa/mcclellan.c + src/nfa/mcclellan.h + src/nfa/mcclellan_common_impl.h + src/nfa/mcclellan_internal.h + src/nfa/mcsheng.c + src/nfa/mcsheng_data.c + src/nfa/mcsheng.h + src/nfa/mcsheng_internal.h src/nfa/mpv.h src/nfa/mpv.c src/nfa/mpv_internal.h @@ -542,6 +598,8 @@ set (hs_exec_SRCS 
src/rose/init.h src/rose/init.c src/rose/stream.c + src/rose/stream_long_lit.h + src/rose/stream_long_lit_hash.h src/rose/match.h src/rose/match.c src/rose/miracle.h @@ -554,15 +612,16 @@ set (hs_exec_SRCS src/rose/rose_types.h src/rose/rose_common.h src/rose/validate_mask.h + src/rose/validate_shufti.h src/util/bitutils.h + src/util/copybytes.h src/util/exhaust.h src/util/fatbit.h - src/util/fatbit.c src/util/join.h src/util/masked_move.h src/util/multibit.h - src/util/multibit_internal.h src/util/multibit.c + src/util/multibit_internal.h src/util/pack_bits.h src/util/popcount.h src/util/pqueue.h @@ -574,21 +633,14 @@ set (hs_exec_SRCS src/util/state_compress.c src/util/unaligned.h src/util/uniform_ops.h - src/scratch.h - src/scratch.c - src/crc32.c - src/crc32.h src/database.c src/database.h ) -if (HAVE_AVX2) - set (hs_exec_SRCS - ${hs_exec_SRCS} - src/fdr/teddy_avx2.c - src/util/masked_move.c - ) -endif () +set (hs_exec_avx2_SRCS + src/fdr/teddy_avx2.c + src/util/masked_move.c +) SET (hs_SRCS @@ -621,8 +673,6 @@ SET (hs_SRCS src/fdr/fdr_engine_description.cpp src/fdr/fdr_engine_description.h src/fdr/fdr_internal.h - src/fdr/fdr_streaming_compile.cpp - src/fdr/fdr_streaming_internal.h src/fdr/flood_compile.cpp src/fdr/teddy_compile.cpp src/fdr/teddy_compile.h @@ -660,6 +710,8 @@ SET (hs_SRCS src/nfa/mcclellancompile.h src/nfa/mcclellancompile_util.cpp src/nfa/mcclellancompile_util.h + src/nfa/mcsheng_compile.cpp + src/nfa/mcsheng_compile.h src/nfa/limex_compile.cpp src/nfa/limex_compile.h src/nfa/limex_accel.h @@ -677,6 +729,8 @@ SET (hs_SRCS src/nfa/nfa_internal.h src/nfa/nfa_kind.h src/nfa/rdfa.h + src/nfa/rdfa_graph.cpp + src/nfa/rdfa_graph.h src/nfa/rdfa_merge.cpp src/nfa/rdfa_merge.h src/nfa/repeat_internal.h @@ -721,7 +775,6 @@ SET (hs_SRCS src/nfagraph/ng_extparam.h src/nfagraph/ng_fixed_width.cpp src/nfagraph/ng_fixed_width.h - src/nfagraph/ng_graph.h src/nfagraph/ng_haig.cpp src/nfagraph/ng_haig.h src/nfagraph/ng_holder.cpp @@ -875,6 +928,7 @@ SET (hs_SRCS src/rose/rose_build_compile.cpp src/rose/rose_build_convert.cpp src/rose/rose_build_convert.h + src/rose/rose_build_engine_blob.h src/rose/rose_build_exclusive.cpp src/rose/rose_build_exclusive.h src/rose/rose_build_groups.cpp @@ -882,6 +936,8 @@ SET (hs_SRCS src/rose/rose_build_impl.h src/rose/rose_build_infix.cpp src/rose/rose_build_infix.h + src/rose/rose_build_long_lit.cpp + src/rose/rose_build_long_lit.h src/rose/rose_build_lookaround.cpp src/rose/rose_build_lookaround.h src/rose/rose_build_matchers.cpp @@ -889,6 +945,8 @@ SET (hs_SRCS src/rose/rose_build_merge.cpp src/rose/rose_build_merge.h src/rose/rose_build_misc.cpp + src/rose/rose_build_program.cpp + src/rose/rose_build_program.h src/rose/rose_build_role_aliasing.cpp src/rose/rose_build_scatter.cpp src/rose/rose_build_scatter.h @@ -915,14 +973,15 @@ SET (hs_SRCS src/util/compile_error.cpp src/util/compile_error.h src/util/container.h - src/util/cpuid_flags.c - src/util/cpuid_flags.h src/util/depth.cpp src/util/depth.h src/util/determinise.h src/util/dump_mask.cpp src/util/dump_mask.h + src/util/fatbit_build.cpp + src/util/fatbit_build.h src/util/graph.h + src/util/hash.h src/util/multibit_build.cpp src/util/multibit_build.h src/util/order_check.h @@ -937,6 +996,7 @@ SET (hs_SRCS src/util/target_info.cpp src/util/target_info.h src/util/ue2_containers.h + src/util/ue2_graph.h src/util/ue2string.cpp src/util/ue2string.h src/util/unaligned.h @@ -966,6 +1026,8 @@ set(hs_dump_SRCS src/nfa/limex_dump.cpp src/nfa/mcclellandump.cpp src/nfa/mcclellandump.h + 
src/nfa/mcsheng_dump.cpp + src/nfa/mcsheng_dump.h src/nfa/mpv_dump.cpp src/nfa/nfa_dump_api.h src/nfa/nfa_dump_dispatch.cpp @@ -990,6 +1052,8 @@ set(hs_dump_SRCS src/rose/rose_dump.h src/util/dump_charclass.cpp src/util/dump_charclass.h + src/util/dump_util.cpp + src/util/dump_util.h ) if (DUMP_SUPPORT) @@ -1002,27 +1066,106 @@ endif() set (LIB_VERSION ${HS_VERSION}) set (LIB_SOVERSION ${HS_MAJOR_VERSION}) -add_library(hs_exec OBJECT ${hs_exec_SRCS}) +if (NOT FAT_RUNTIME) + + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_common_SRCS}) + + if (HAVE_AVX2) + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + endif() + + add_library(hs_exec OBJECT ${hs_exec_SRCS}) + + add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>) + set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + + if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) + add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) + set_target_properties(hs_exec_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + endif() + +else (FAT_RUNTIME) + set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") + add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) + set_target_properties(hs_exec_core2 PROPERTIES + COMPILE_FLAGS "-march=core2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + + add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) + set_target_properties(hs_exec_corei7 PROPERTIES + COMPILE_FLAGS "-march=corei7" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + + add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + set_target_properties(hs_exec_avx2 PROPERTIES + COMPILE_FLAGS "-march=core-avx2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + + add_library(hs_exec_common OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) + set_source_files_properties(src/dispatcher.c PROPERTIES + COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function") + + if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) + add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) + set_target_properties(hs_exec_shared_core2 PROPERTIES + COMPILE_FLAGS "-march=core2" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) + set_target_properties(hs_exec_shared_corei7 PROPERTIES + COMPILE_FLAGS "-march=corei7" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_shared_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + set_target_properties(hs_exec_shared_avx2 PROPERTIES + COMPILE_FLAGS "-march=core-avx2" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_common_shared OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) + set_target_properties(hs_exec_common_shared PROPERTIES + OUTPUT_NAME hs_exec_common + POSITION_INDEPENDENT_CODE TRUE) + endif() # SHARED -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) -add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) -set_target_properties(hs_exec_shared PROPERTIES - POSITION_INDEPENDENT_CODE TRUE) -endif() # hs_version.c is added explicitly to avoid some build systems that refuse to # create a lib without any src (I'm looking at you Xcode) -add_library(hs_runtime STATIC src/hs_version.c $<TARGET_OBJECTS:hs_exec>) + add_library(hs_runtime STATIC src/hs_version.c + $<TARGET_OBJECTS:hs_exec_common> $<TARGET_OBJECTS:hs_exec_core2> + $<TARGET_OBJECTS:hs_exec_corei7> $<TARGET_OBJECTS:hs_exec_avx2>) +endif (NOT 
FAT_RUNTIME) -set_target_properties(hs_runtime PROPERTIES - LINKER_LANGUAGE C) + +set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) if (NOT BUILD_SHARED_LIBS) install(TARGETS hs_runtime DESTINATION lib) endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) - add_library(hs_runtime_shared SHARED src/hs_version.c $<TARGET_OBJECTS:hs_exec_shared>) + if (NOT FAT_RUNTIME) + add_library(hs_runtime_shared SHARED src/hs_version.c src/hs_valid_platform.c +$<TARGET_OBJECTS:hs_exec_shared>) + else() + add_library(hs_runtime_shared SHARED src/hs_version.c + src/hs_valid_platform.c + $<TARGET_OBJECTS:hs_exec_common_shared> + $<TARGET_OBJECTS:hs_exec_shared_core2> + $<TARGET_OBJECTS:hs_exec_shared_corei7> + $<TARGET_OBJECTS:hs_exec_shared_avx2>) + endif() set_target_properties(hs_runtime_shared PROPERTIES VERSION ${LIB_VERSION} SOVERSION ${LIB_SOVERSION} @@ -1035,8 +1178,14 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) LIBRARY DESTINATION lib) endif() -# we want the static lib for testing -add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>) +if (NOT FAT_RUNTIME) + add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>) +else() + # we want the static lib for testing + add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c + ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_common> $<TARGET_OBJECTS:hs_exec_core2> + $<TARGET_OBJECTS:hs_exec_corei7> $<TARGET_OBJECTS:hs_exec_avx2>) +endif() add_dependencies(hs ragel_Parser) @@ -1045,7 +1194,17 @@ install(TARGETS hs DESTINATION lib) endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) - add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>) + if (NOT FAT_RUNTIME) + add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c + ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>) + else() + add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c + ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_common_shared> + $<TARGET_OBJECTS:hs_exec_shared_core2> + $<TARGET_OBJECTS:hs_exec_shared_corei7> + $<TARGET_OBJECTS:hs_exec_shared_avx2>) + + endif() add_dependencies(hs_shared ragel_Parser) set_target_properties(hs_shared PROPERTIES OUTPUT_NAME hs diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c00401dd..e98fbf22 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -11,7 +11,8 @@ else () endif () -set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") +set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") + # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { @@ -19,10 +20,6 @@ int main() { (void)_mm_shuffle_epi8(a, a); }" HAVE_SSSE3) -if (NOT HAVE_SSSE3) - message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") -endif () - # now look for AVX2 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX2__) @@ -34,9 +31,5 @@ int main(){ (void)_mm256_xor_si256(z, z); }" HAVE_AVX2) -if (NOT HAVE_AVX2) - message(STATUS "Building without AVX2 support") -endif () - unset (CMAKE_REQUIRED_FLAGS) unset (INTRIN_INC_H) diff --git a/cmake/attrib.cmake b/cmake/attrib.cmake new file mode 100644 index 00000000..5600ce6b --- /dev/null +++ b/cmake/attrib.cmake @@ -0,0 +1,13 @@ +# tests for compiler properties + +# set -Werror so we can't ignore unused attribute warnings +set (CMAKE_REQUIRED_FLAGS "-Werror") + +CHECK_C_SOURCE_COMPILES(" + int foo(int) __attribute__ ((ifunc(\"foo_i\"))); + int f1(int i) { return i; } + void (*foo_i()) { return f1; } + int main(void) { return 0; } + " HAS_C_ATTR_IFUNC) + +unset(CMAKE_REQUIRED_FLAGS) diff --git a/cmake/boost.cmake b/cmake/boost.cmake new file mode 100644 index 00000000..3d513deb --- /dev/null +++ b/cmake/boost.cmake @@ -0,0 +1,41 @@ +# Boost 1.62 has a bug that we've patched around, check if it is required +if (Boost_VERSION EQUAL 106200) + set (CMAKE_REQUIRED_INCLUDES ${BOOST_INCLUDEDIR} "${PROJECT_SOURCE_DIR}/include") + set (BOOST_REV_TEST " +#include <boost/graph/adjacency_list.hpp> +#include <boost/graph/reverse_graph.hpp> +#include <boost/graph/graph_concepts.hpp> +#include <boost/concept/assert.hpp> + +int main(int,char*[]) +{ + using namespace boost; + // Check const reverse_graph + { + typedef adjacency_list< 
vecS, vecS, bidirectionalS, + property<vertex_color_t, int>, + property<edge_weight_t, int>, + property<graph_name_t, std::string> + > AdjList; + typedef reverse_graph<AdjList> Graph; + BOOST_CONCEPT_ASSERT(( BidirectionalGraphConcept<Graph> )); + } + return 0; +} +") + + CHECK_CXX_SOURCE_COMPILES("${BOOST_REV_TEST}" BOOST_REVGRAPH_OK) + + if (NOT BOOST_REVGRAPH_OK) + message(STATUS "trying patched") + CHECK_CXX_SOURCE_COMPILES(" +#include <boost-patched/graph/reverse_graph.hpp> +${BOOST_REV_TEST}" BOOST_REVGRAPH_PATCH) + endif() + + if (NOT BOOST_REVGRAPH_OK AND NOT BOOST_REVGRAPH_PATCH) + message(FATAL_ERROR "Something is wrong with this copy of boost::reverse_graph") + endif() + + unset (CMAKE_REQUIRED_INCLUDES) +endif () # Boost 1.62.0 diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh new file mode 100755 index 00000000..5baf209b --- /dev/null +++ b/cmake/build_wrapper.sh @@ -0,0 +1,27 @@ +#!/bin/sh -e +# This is used for renaming symbols for the fat runtime, don't call directly +# TODO: make this a lot less fragile! +PREFIX=$1 +KEEPSYMS_IN=$2 +shift 2 +BUILD=$@ +OUT=$(echo $BUILD | sed 's/.* -o \(.*\.o\).*/\1/') +SYMSFILE=/tmp/${PREFIX}_rename.syms.$$ +KEEPSYMS=/tmp/keep.syms.$$ +# grab the command without the target obj or src file flags +# we don't just call gcc directly as there may be flags modifying the arch +CC_CMD=$(echo $BUILD | sed 's/ -o .*\.o//;s/ -c //;s/ .[^ ]*\.c//;') +# find me a libc +LIBC_SO=$(${CC_CMD} --print-file-name=libc.so.6) +cp ${KEEPSYMS_IN} ${KEEPSYMS} +# get all symbols from libc and turn them into patterns +nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} +# build the object +${BUILD} +# rename the symbols in the object +nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE} +if test -s ${SYMSFILE} +then + objcopy --redefine-syms=${SYMSFILE} ${OUT} +fi +rm -f ${SYMSFILE} ${KEEPSYMS} diff --git a/cmake/config.h.in b/cmake/config.h.in index 75c27b3e..c7b577c2 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -15,6 +15,9 @@ /* internal build, switch on dump support. */ #cmakedefine DUMP_SUPPORT +/* Define if building "fat" runtime. */ +#cmakedefine FAT_RUNTIME + /* Define to 1 if `backtrace' works. */ #cmakedefine HAVE_BACKTRACE @@ -67,9 +70,6 @@ /* Define if the sqlite3_open_v2 call is available */ #cmakedefine HAVE_SQLITE3_OPEN_V2 -/* Define to 1 if you have the <tmmintrin.h> header file. */ -#cmakedefine HAVE_TMMINTRIN_H - /* Define to 1 if you have the <unistd.h> header file. */ #cmakedefine HAVE_UNISTD_H @@ -89,3 +89,5 @@ /* define if this is a release build. 
*/ #cmakedefine RELEASE_BUILD +/* define if reverse_graph requires patch for boost 1.62.0 */ +#cmakedefine BOOST_REVGRAPH_PATCH diff --git a/cmake/keep.syms.in b/cmake/keep.syms.in new file mode 100644 index 00000000..ab6f82a5 --- /dev/null +++ b/cmake/keep.syms.in @@ -0,0 +1,11 @@ +# names to exclude +hs_misc_alloc +hs_misc_free +hs_free_scratch +hs_stream_alloc +hs_stream_free +hs_scratch_alloc +hs_scratch_free +hs_database_alloc +hs_database_free +^_ diff --git a/cmake/sqlite3.cmake b/cmake/sqlite3.cmake new file mode 100644 index 00000000..c07f1161 --- /dev/null +++ b/cmake/sqlite3.cmake @@ -0,0 +1,53 @@ +# +# a lot of noise to find sqlite +# + +option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF) + +if(NOT WIN32 AND NOT SQLITE_PREFER_STATIC) +find_package(PkgConfig QUIET) + +# first check for sqlite on the system +pkg_check_modules(SQLITE3 sqlite3) +endif() + +if (NOT SQLITE3_FOUND) + message(STATUS "looking for sqlite3 in source tree") + # look in the source tree + if (EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.h" AND + EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c") + message(STATUS " found sqlite3 in source tree") + set(SQLITE3_FOUND TRUE) + set(SQLITE3_BUILD_SOURCE TRUE) + set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3") + set(SQLITE3_LDFLAGS sqlite3_static) + else() + message(FATAL_ERROR " no sqlite3 in source tree") + endif() +endif() + +# now do version checks +if (SQLITE3_FOUND) + list(INSERT CMAKE_REQUIRED_INCLUDES 0 "${SQLITE3_INCLUDE_DIRS}") + CHECK_C_SOURCE_COMPILES("#include <sqlite3.h>\n#if SQLITE_VERSION_NUMBER >= 3008007 && SQLITE_VERSION_NUMBER < 3008010\n#error broken sqlite\n#endif\nint main() {return 0;}" SQLITE_VERSION_OK) + if (NOT SQLITE_VERSION_OK) + message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version") + endif() +if (NOT SQLITE3_BUILD_SOURCE) + set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS}) + CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2) + list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}") + list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS}) +else() + if (NOT TARGET sqlite3_static) + # build sqlite as a static lib to compile into our test programs + add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c") + if (NOT WIN32) + set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION") + endif() + endif() +endif() +endif() + +# that's enough about sqlite diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index 826349a7..1794f3e9 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -169,6 +169,9 @@ Common options for CMake include: +------------------------+----------------------------------------------------+ | DEBUG_OUTPUT | Enable very verbose debug output. Default off. | +------------------------+----------------------------------------------------+ +| FAT_RUNTIME | Build the :ref:`fat runtime <fat_runtime>`. Default | +| | true on Linux, not available elsewhere. | ++------------------------+----------------------------------------------------+ For example, to generate a ``Debug`` build: :: @@ -199,11 +202,11 @@ The other types of builds are: Target Architecture ------------------- -By default, Hyperscan will be compiled to target the instruction set of the -processor of the machine that being used for compilation. 
This is done via -the use of ``-march=native``. The result of this means that a library built on -one machine may not work on a different machine if they differ in supported -instruction subsets. +Unless using the :ref:`fat runtime <fat_runtime>`, by default Hyperscan will be +compiled to target the instruction set of the processor of the machine being +used for compilation. This is done via the use of ``-march=native``. This +means that a library built on one machine may not work on a different machine +if they differ in supported instruction subsets. To override the use of ``-march=native``, set appropriate flags for the compiler in ``CFLAGS`` and ``CXXFLAGS`` environment variables before invoking @@ -215,3 +218,57 @@ example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: :: For more information, refer to :ref:`instr_specialization`. +.. _fat_runtime: + +Fat Runtime +----------- + +A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan +library to dispatch the most appropriate runtime code for the host processor. +This feature is called the "fat runtime", as a single Hyperscan library +contains multiple copies of the runtime code for different instruction sets. + +.. note:: + + The fat runtime feature is only available on Linux. Release builds of + Hyperscan will default to having the fat runtime enabled where supported. + +When building the library with the fat runtime, the Hyperscan runtime code +will be compiled multiple times for these different instruction sets, and +these compiled objects are combined into one library. There are no changes to +how user applications are built against this library. + +When applications are executed, the correct version of the runtime is selected +for the machine that it is running on. This is done using a ``CPUID`` check +for the presence of the instruction set, and then an indirect function is +resolved so that the right version of each API function is used. There is no +impact on function call performance, as this check and resolution is performed +by the ELF loader once when the binary is loaded. + +If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime +API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR` +instead of potentially executing illegal instructions. The API function +:c:func:`hs_valid_platform` can be used by application writers to determine if +the current platform is supported by Hyperscan. + +As of this release, the variants of the runtime that are built, and the CPU +capability that is required, are the following: + ++----------+-------------------------------+---------------------+ +| Variant | CPU Feature Flag(s) Required | gcc arch flag | ++==========+===============================+=====================+ +| Core 2 | ``SSSE3`` | ``-march=core2`` | ++----------+-------------------------------+---------------------+ +| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` | ++----------+-------------------------------+---------------------+ +| AVX 2 | ``AVX2`` | ``-march=core-avx2`` | ++----------+-------------------------------+---------------------+ + +As this requires compiler, libc, and binutils support, at this time the fat +runtime will only be enabled for Linux builds where the compiler supports the +`indirect function "ifunc" function attribute +<https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html>`_. + +This attribute should be available on all supported versions of GCC, and +recent versions of Clang and ICC. There is currently no operating system +support for this feature on non-Linux systems.
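The dispatch pattern described above can be illustrated with a minimal,
self-contained sketch of the ifunc mechanism (names and feature checks here
are illustrative assumptions, not Hyperscan's actual sources)::

    #include <stdio.h>

    static int scan_avx2(void) { return 2; }  /* stand-in for the AVX2 build */
    static int scan_core2(void) { return 1; } /* stand-in for the SSSE3 build */

    /* the resolver runs once, at load time, and picks an implementation */
    static int (*resolve_scan(void))(void) {
        __builtin_cpu_init(); /* required before __builtin_cpu_supports() */
        return __builtin_cpu_supports("avx2") ? scan_avx2 : scan_core2;
    }

    /* every call to scan() binds to the resolved variant via the ELF loader */
    int scan(void) __attribute__((ifunc("resolve_scan")));

    int main(void) {
        printf("selected variant: %d\n", scan());
        return 0;
    }

In the fat runtime the same idea is applied to each exported API function,
with the CPU feature checks selecting among the core2/corei7/avx2 builds
listed above.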
diff --git a/examples/patbench.cc b/examples/patbench.cc index 9c2b41fa..f82f47a7 100644 --- a/examples/patbench.cc +++ b/examples/patbench.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -833,6 +833,8 @@ static unsigned parseFlags(const string &flagsStr) { flags |= HS_FLAG_UTF8; break; case 'W': flags |= HS_FLAG_UCP; break; + case '\r': // stray carriage-return + break; default: cerr << "Unsupported flag \'" << c << "\'" << endl; exit(-1); diff --git a/examples/pcapscan.cc b/examples/pcapscan.cc index 032b19cd..12b94438 100644 --- a/examples/pcapscan.cc +++ b/examples/pcapscan.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -621,6 +621,8 @@ static unsigned parseFlags(const string &flagsStr) { flags |= HS_FLAG_UTF8; break; case 'W': flags |= HS_FLAG_UCP; break; + case '\r': // stray carriage-return + break; default: cerr << "Unsupported flag \'" << c << "\'" << endl; exit(-1); diff --git a/include/boost-patched/graph/reverse_graph.hpp b/include/boost-patched/graph/reverse_graph.hpp index 07a11f9b..8f98a1d5 100644 --- a/include/boost-patched/graph/reverse_graph.hpp +++ b/include/boost-patched/graph/reverse_graph.hpp @@ -5,7 +5,7 @@ #include <boost/version.hpp> -#if (BOOST_VERSION == 106200) +#if defined(BOOST_REVGRAPH_PATCH) // Boost 1.62.0 does not implement degree() in reverse_graph which is required // by BidirectionalGraph, so add it. diff --git a/src/compiler/asserts.cpp b/src/compiler/asserts.cpp index 0365e268..be836b06 100644 --- a/src/compiler/asserts.cpp +++ b/src/compiler/asserts.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -117,11 +117,11 @@ typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t; static void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, u32 &assert_edge_count) { - DEBUG_PRINTF("replacing assert vertex %u\n", g[t].index); + DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index); const u32 flags = g[t].assert_flags; - DEBUG_PRINTF("consider assert vertex %u with flags %u\n", - g[t].index, flags); + DEBUG_PRINTF("consider assert vertex %zu with flags %u\n", g[t].index, + flags); // Wire up all the predecessors to all the successors. 
@@ -142,7 +142,7 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, for (const auto &outEdge : out_edges_range(t, g)) { NFAVertex v = target(outEdge, g); - DEBUG_PRINTF("consider path [%u,%u,%u]\n", g[u].index, + DEBUG_PRINTF("consider path [%zu,%zu,%zu]\n", g[u].index, g[t].index, g[v].index); if (v == t) { @@ -173,9 +173,8 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, auto cache_key = make_pair(u, v); auto ecit = edge_cache.find(cache_key); if (ecit == edge_cache.end()) { - DEBUG_PRINTF("adding edge %u %u\n", g[u].index, - g[v].index); - NFAEdge e = add_edge(u, v, g).first; + DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index); + NFAEdge e = add_edge(u, v, g); edge_cache.emplace(cache_key, e); g[e].assert_flags = flags; if (++assert_edge_count > MAX_ASSERT_EDGES) { @@ -184,7 +183,7 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, } } else { NFAEdge e = ecit->second; - DEBUG_PRINTF("updating edge %u %u [a %u]\n", g[u].index, + DEBUG_PRINTF("updating edge %zu %zu [a %zu]\n", g[u].index, g[v].index, g[t].index); // Edge already exists. u32 &e_flags = g[e].assert_flags; @@ -211,8 +210,7 @@ void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) { Report r = rm.getBasicInternalReport(g, adj); g[v].reports.insert(rm.getInternalId(r)); - DEBUG_PRINTF("set report id for vertex %u, adj %d\n", - g[v].index, adj); + DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj); } static @@ -222,8 +220,7 @@ void checkForMultilineStart(ReportManager &rm, NGWrapper &g) { if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) { continue; } - DEBUG_PRINTF("mls %u %08x\n", g[v].index, - g[v].assert_flags); + DEBUG_PRINTF("mls %zu %08x\n", g[v].index, g[v].assert_flags); /* we have found a multi-line start (maybe more than one) */ @@ -299,8 +296,8 @@ void removeAssertVertices(ReportManager &rm, NGWrapper &g) { DEBUG_PRINTF("resolved %zu assert vertices\n", num); pruneUseless(g); pruneEmptyVertices(g); - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); } DEBUG_PRINTF("after: graph has %zu vertices\n", num_vertices(g)); diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index d56aff88..4a4afc64 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -29,8 +29,10 @@ /** \file * \brief Compiler front-end interface. */ +#include "allocator.h" #include "asserts.h" #include "compiler.h" +#include "crc32.h" #include "database.h" #include "grey.h" #include "hs_internal.h" @@ -321,6 +323,45 @@ platform_t target_to_platform(const target_t &target_info) { return p; } +/** \brief Encapsulate the given bytecode (RoseEngine) in a newly-allocated + * \ref hs_database, ensuring that it is padded correctly to give cacheline + * alignment. 
*/ +static +hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) { + size_t db_len = sizeof(struct hs_database) + len; + DEBUG_PRINTF("db size %zu\n", db_len); + DEBUG_PRINTF("db platform %llx\n", platform); + + struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len); + if (hs_check_alloc(db) != HS_SUCCESS) { + hs_database_free(db); + return nullptr; + } + + // So that none of our database is uninitialized + memset(db, 0, db_len); + + // we need to align things manually + size_t shift = (uintptr_t)db->bytes & 0x3f; + DEBUG_PRINTF("shift is %zu\n", shift); + + db->bytecode = offsetof(struct hs_database, bytes) - shift; + char *bytecode = (char *)db + db->bytecode; + assert(ISALIGNED_CL(bytecode)); + + db->magic = HS_DB_MAGIC; + db->version = HS_DB_VERSION; + db->length = len; + db->platform = platform; + + // Copy bytecode + memcpy(bytecode, in_bytecode, len); + + db->crc32 = Crc32c_ComputeBuf(0, bytecode, db->length); + return db; +} + + struct hs_database *build(NG &ng, unsigned int *length) { assert(length); diff --git a/src/database.c b/src/database.c index a4e10c22..61eb021f 100644 --- a/src/database.c +++ b/src/database.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -348,43 +348,6 @@ hs_error_t dbIsValid(const hs_database_t *db) { return HS_SUCCESS; } -/** \brief Encapsulate the given bytecode (RoseEngine) in a newly-allocated - * \ref hs_database, ensuring that it is padded correctly to give cacheline - * alignment. */ -hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) { - size_t db_len = sizeof(struct hs_database) + len; - DEBUG_PRINTF("db size %zu\n", db_len); - DEBUG_PRINTF("db platform %llx\n", platform); - - struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len); - if (hs_check_alloc(db) != HS_SUCCESS) { - hs_database_free(db); - return NULL; - } - - // So that none of our database is uninitialized - memset(db, 0, db_len); - - // we need to align things manually - size_t shift = (uintptr_t)db->bytes & 0x3f; - DEBUG_PRINTF("shift is %zu\n", shift); - - db->bytecode = offsetof(struct hs_database, bytes) - shift; - char *bytecode = (char *)db + db->bytecode; - assert(ISALIGNED_CL(bytecode)); - - db->magic = HS_DB_MAGIC; - db->version = HS_DB_VERSION; - db->length = len; - db->platform = platform; - - // Copy bytecode - memcpy(bytecode, in_bytecode, len); - - db->crc32 = Crc32c_ComputeBuf(0, bytecode, db->length); - return db; -} - #if defined(_WIN32) #define SNPRINTF_COMPAT _snprintf #else diff --git a/src/database.h b/src/database.h index 5488c93d..399513fc 100644 --- a/src/database.h +++ b/src/database.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -110,7 +110,6 @@ hs_error_t validDatabase(const hs_database_t *db) { } hs_error_t dbIsValid(const struct hs_database *db); -struct hs_database *dbCreate(const char *bytecode, size_t len, u64a platform); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/dispatcher.c b/src/dispatcher.c new file mode 100644 index 00000000..fb2f4f02 --- /dev/null +++ b/src/dispatcher.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2016, Intel 
Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "hs_common.h" +#include "hs_runtime.h" +#include "ue2common.h" +#include "util/cpuid_flags.h" +#include "util/join.h" + +#define CREATE_DISPATCH(RTYPE, NAME, ...) \ + /* create defns */ \ + RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \ + RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \ + RTYPE JOIN(core2_, NAME)(__VA_ARGS__); \ + \ + /* error func */ \ + static inline RTYPE JOIN(error_, NAME)(__VA_ARGS__) { \ + return (RTYPE)HS_ARCH_ERROR; \ + } \ + \ + /* resolver */ \ + static void(*JOIN(resolve_, NAME)(void)) { \ + if (check_avx2()) { \ + return JOIN(avx2_, NAME); \ + } \ + if (check_sse42() && check_popcnt()) { \ + return JOIN(corei7_, NAME); \ + } \ + if (check_ssse3()) { \ + return JOIN(core2_, NAME); \ + } \ + /* anything else is fail */ \ + return JOIN(error_, NAME); \ + } \ + \ + /* function */ \ + HS_PUBLIC_API \ + RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME))) + +CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data, + unsigned length, unsigned flags, hs_scratch_t *scratch, + match_event_handler onEvent, void *userCtx); + +CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database, + size_t *stream_size); + +CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db, + size_t *size); +CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db); +CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db); + +CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db, + unsigned int flags, hs_stream_t **stream); + +CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data, + unsigned int length, unsigned int flags, hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); + +CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id, + hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt); + +CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db, + const char *const *data, const unsigned int *length, + unsigned int 
count, unsigned int flags, hs_scratch_t *scratch, + match_event_handler onevent, void *context); + +CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info); + +CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id, + const hs_stream_t *from_id); + +CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id, + unsigned int flags, hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id, + const hs_stream_t *from_id, hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db, + char **bytes, size_t *length); + +CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes, + const size_t length, hs_database_t **db); + +CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes, + const size_t length, hs_database_t *db); + +CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes, + size_t length, char **info); + +CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes, + const size_t length, size_t *deserialized_size); + +/** INTERNALS **/ + +CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen); diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 4230c2b1..23416c70 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -31,7 +31,6 @@ #include "fdr_confirm_runtime.h" #include "fdr_internal.h" #include "fdr_loadval.h" -#include "fdr_streaming_runtime.h" #include "flood_runtime.h" #include "teddy.h" #include "teddy_internal.h" @@ -809,8 +808,6 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, len, hbuf, 0, - hbuf, // nocase - 0, start, cb, ctxt, @@ -828,14 +825,12 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, size_t start, HWLMCallback cb, void *ctxt, - hwlm_group_t groups, u8 *stream_state) { + hwlm_group_t groups) { struct FDR_Runtime_Args a = { buf, len, hbuf, hlen, - hbuf, // nocase - start same as caseful, override later if needed - hlen, // nocase start, cb, ctxt, @@ -844,7 +839,6 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, * the history buffer (they may be garbage). */ hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 }; - fdrUnpackState(fdr, &a, stream_state); hwlm_error_t ret; if (unlikely(a.start_offset >= a.len)) { @@ -854,6 +848,5 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, ret = funcs[fdr->engineID](fdr, &a, groups); } - fdrPackState(fdr, &a, stream_state); return ret; } diff --git a/src/fdr/fdr.h b/src/fdr/fdr.h index e0aa594f..e2b80056 100644 --- a/src/fdr/fdr.h +++ b/src/fdr/fdr.h @@ -43,10 +43,6 @@ extern "C" { struct FDR; -/** \brief Returns non-zero if the contents of the stream state indicate that - * there is active FDR history beyond the regularly used history. */ -u32 fdrStreamStateActive(const struct FDR *fdr, const u8 *stream_state); - /** * \brief Block-mode scan. * @@ -74,12 +70,11 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, * \param cb Callback to call when a match is found. * \param ctxt Caller-provided context pointer supplied to callback on match. * \param groups Initial groups mask. - * \param stream_state Persistent stream state for use by FDR. 
*/ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, size_t start, HWLMCallback cb, void *ctxt, - hwlm_group_t groups, u8 *stream_state); + hwlm_group_t groups); #ifdef __cplusplus } diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 89a0ff72..937513a8 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -39,6 +39,7 @@ #include "teddy_engine_description.h" #include "grey.h" #include "ue2common.h" +#include "hwlm/hwlm_build.h" #include "util/alloc.h" #include "util/compare.h" #include "util/dump_mask.h" @@ -495,14 +496,34 @@ FDRCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) { } // namespace +static +size_t maxMaskLen(const vector<hwlmLiteral> &lits) { + size_t rv = 0; + for (const auto &lit : lits) { + rv = max(rv, lit.msk.size()); + } + return rv; +} + +static +void setHistoryRequired(hwlmStreamingControl &stream_ctl, + const vector<hwlmLiteral> &lits) { + size_t max_mask_len = maxMaskLen(lits); + + // we want enough history to manage the longest literal and the longest + // mask. + stream_ctl.literal_history_required = max(maxLen(lits), max_mask_len) - 1; +} + static aligned_unique_ptr<FDR> fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small, const target_t &target, const Grey &grey, u32 hint, hwlmStreamingControl *stream_control) { pair<aligned_unique_ptr<u8>, size_t> link(nullptr, 0); + if (stream_control) { - link = fdrBuildTableStreaming(lits, *stream_control); + setHistoryRequired(*stream_control, lits); } DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2"); diff --git a/src/fdr/fdr_confirm.h b/src/fdr/fdr_confirm.h index 865218b4..6ce85afd 100644 --- a/src/fdr/fdr_confirm.h +++ b/src/fdr/fdr_confirm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,19 +52,18 @@ typedef enum LitInfoFlags { /** * \brief Structure describing a literal, linked to by FDRConfirm. * - * This structure is followed in memory by a variable-sized string prefix at - * LitInfo::s, for strings that are longer than CONF_TYPE. + * This structure is followed in memory by a variable-sized string prefix, for + * strings that are longer than CONF_TYPE. */ struct LitInfo { CONF_TYPE v; CONF_TYPE msk; hwlm_group_t groups; - u32 size; u32 id; // literal ID as passed in + u8 size; u8 flags; /* LitInfoFlags */ u8 next; u8 extended_size; - u8 s[1]; // literal prefix, which continues "beyond" this struct. 
}; #define FDRC_FLAG_NO_CONFIRM 1 diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 23437fe2..e77c46d1 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -107,7 +107,7 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo, info.extended_size = verify_u8(lit.msk.size()); } info.flags = flags; - info.size = verify_u32(lit.s.size()); + info.size = verify_u8(lit.s.size()); info.groups = lit.groups; // these are built up assuming a LE machine @@ -333,13 +333,13 @@ getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt, const string &t = lits[litIdx].s; if (t.size() > sizeof(CONF_TYPE)) { size_t prefix_len = t.size() - sizeof(CONF_TYPE); - memcpy(&finalLI.s[0], t.c_str(), prefix_len); - ptr = &finalLI.s[0] + prefix_len; + memcpy(ptr, t.c_str(), prefix_len); + ptr += prefix_len; } ptr = ROUNDUP_PTR(ptr, alignof(LitInfo)); if (next(i) == e) { - finalLI.next = 0x0; + finalLI.next = 0; } else { // our next field represents an adjustment on top of // current address + the actual size of the literal diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index 9b1df593..87ade9fe 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -74,10 +74,8 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a if (loc < buf) { u32 full_overhang = buf - loc; - const u8 *history = caseless ? a->buf_history_nocase - : a->buf_history; - size_t len_history = caseless ? a->len_history_nocase - : a->len_history; + const u8 *history = a->buf_history; + size_t len_history = a->len_history; // can't do a vectored confirm either if we don't have // the bytes @@ -88,7 +86,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a // as for the regular case, no need to do a full confirm if // we're a short literal if (unlikely(li->size > sizeof(CONF_TYPE))) { - const u8 *s1 = li->s; + const u8 *s1 = (const u8 *)li + sizeof(*li); const u8 *s2 = s1 + full_overhang; const u8 *loc1 = history + len_history - full_overhang; const u8 *loc2 = buf; @@ -108,7 +106,8 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a // if string < conf_type we don't need regular string cmp if (unlikely(li->size > sizeof(CONF_TYPE))) { - if (cmpForward(loc, li->s, li->size - sizeof(CONF_TYPE), + const u8 *s = (const u8 *)li + sizeof(*li); + if (cmpForward(loc, s, li->size - sizeof(CONF_TYPE), caseless)) { goto out; } @@ -123,8 +122,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount; if (loc2 < buf) { u32 full_overhang = buf - loc2; - size_t len_history = caseless ? 
a->len_history_nocase - : a->len_history; + size_t len_history = a->len_history; if (full_overhang > len_history) { goto out; } diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index 6272b69e..3bf82837 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -100,8 +100,6 @@ struct FDR_Runtime_Args { size_t len; const u8 *buf_history; size_t len_history; - const u8 *buf_history_nocase; - size_t len_history_nocase; size_t start_offset; HWLMCallback cb; void *ctxt; diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp deleted file mode 100644 index b2e1656c..00000000 --- a/src/fdr/fdr_streaming_compile.cpp +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "fdr_internal.h" -#include "fdr_streaming_internal.h" -#include "fdr_compile_internal.h" -#include "hwlm/hwlm_build.h" -#include "util/alloc.h" -#include "util/bitutils.h" -#include "util/target_info.h" -#include "util/verify_types.h" - -#include -#include -#include -#include -#include -#include - -#include - -using namespace std; -using boost::dynamic_bitset; - -namespace ue2 { - -namespace { -struct LongLitOrder { - bool operator()(const hwlmLiteral &i1, const hwlmLiteral &i2) const { - if (i1.nocase != i2.nocase) { - return i1.nocase < i2.nocase; - } else { - return i1.s < i2.s; - } - } -}; -} - -static -bool hwlmLitEqual(const hwlmLiteral &l1, const hwlmLiteral &l2) { - return l1.s == l2.s && l1.nocase == l2.nocase; -} - -static -u32 roundUpToPowerOfTwo(u32 x) { - x -= 1; - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); - return x + 1; -} - -/** - * \brief Creates a long literals vector containing all literals of length > max_len. - * - * The last char of each literal is trimmed as we're not interested in full - * matches, only partial matches. - * - * Literals are sorted (by caseful/caseless, then lexicographical order) and - * made unique. - * - * The ID of each literal is set to its position in the vector. 
- * - * \return False if there aren't any long literals. - */ -static -bool setupLongLits(const vector &lits, - vector &long_lits, size_t max_len) { - long_lits.reserve(lits.size()); - for (const auto &lit : lits) { - if (lit.s.length() > max_len) { - hwlmLiteral tmp = lit; // copy - tmp.s.pop_back(); - tmp.id = 0; // recalc later - tmp.groups = 0; // filled in later by hash bucket(s) - long_lits.push_back(move(tmp)); - } - } - - if (long_lits.empty()) { - return false; - } - - // sort long_literals by caseful/caseless and in lexicographical order, - // remove duplicates - stable_sort(long_lits.begin(), long_lits.end(), LongLitOrder()); - auto new_end = unique(long_lits.begin(), long_lits.end(), hwlmLitEqual); - long_lits.erase(new_end, long_lits.end()); - - // fill in ids; not currently used - for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { - i->id = distance(long_lits.begin(), i); - } - return true; -} - -// boundaries are the 'start' boundaries for each 'mode' -// so boundary[CASEFUL] is the index one above the largest caseful index -// positions[CASEFUL] is the # of positions in caseful strings (stream) -// hashedPositions[CASEFUL] is the # of positions in caseful strings -// (not returned - a temporary) -// hashEntries[CASEFUL] is the # of positions hashed for caseful strings -// (rounded up to the nearest power of two) -static -void analyzeLits(const vector &long_lits, size_t max_len, - u32 *boundaries, u32 *positions, u32 *hashEntries) { - u32 hashedPositions[MAX_MODES]; - - for (u32 m = CASEFUL; m < MAX_MODES; ++m) { - boundaries[m] = verify_u32(long_lits.size()); - positions[m] = 0; - hashedPositions[m] = 0; - } - - for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { - if (i->nocase) { - boundaries[CASEFUL] = verify_u32(distance(long_lits.begin(), i)); - break; - } - } - - for (const auto &lit : long_lits) { - Modes m = lit.nocase ? CASELESS : CASEFUL; - for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { - hashedPositions[m]++; - } - positions[m] += lit.s.size(); - } - - for (u32 m = CASEFUL; m < MAX_MODES; m++) { - hashEntries[m] = hashedPositions[m] - ? roundUpToPowerOfTwo(MAX(4096, hashedPositions[m])) - : 0; - } - -#ifdef DEBUG_COMPILE - printf("analyzeLits:\n"); - for (Modes m = CASEFUL; m < MAX_MODES; m++) { - printf("mode %s boundary %d positions %d hashedPositions %d " - "hashEntries %d\n", - (m == CASEFUL) ? 
"caseful" : "caseless", boundaries[m], - positions[m], hashedPositions[m], hashEntries[m]); - } - printf("\n"); -#endif -} - -static -u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, Modes m) { - return streaming_hash((const u8 *)l.s.c_str() + offset, max_len, m); -} - -// sort by 'distance from start' -namespace { -struct OffsetIDFromEndOrder { - const vector &lits; // not currently used - explicit OffsetIDFromEndOrder(const vector &lits_in) - : lits(lits_in) {} - bool operator()(const pair &i1, const pair &i2) const { - if (i1.second != i2.second) { - // longest is 'first', so > not < - return i1.second > i2.second; - } - return i1.first < i2.first; - } -}; -} - -static -void fillHashes(const vector &long_lits, size_t max_len, - FDRSHashEntry *tab, size_t numEntries, Modes mode, - map &litToOffsetVal) { - const u32 nbits = lg2(numEntries); - map > > bucketToLitOffPairs; - map bucketToBitfield; - - for (const auto &lit : long_lits) { - if ((mode == CASELESS) != lit.nocase) { - continue; - } - for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { - u32 h = hashLit(lit, j, max_len, mode); - u32 h_ent = h & ((1U << nbits) - 1); - u32 h_low = (h >> nbits) & 63; - bucketToLitOffPairs[h_ent].emplace_back(lit.id, j); - bucketToBitfield[h_ent] |= (1ULL << h_low); - } - } - - // this used to be a set, but a bitset is much much faster given that - // we're using it only for membership testing. - dynamic_bitset<> filledBuckets(numEntries); // all bits zero by default. - - // sweep out bitfield entries and save the results swapped accordingly - // also, anything with bitfield entries is put in filledBuckets - for (const auto &m : bucketToBitfield) { - const u32 &bucket = m.first; - const u64a &contents = m.second; - tab[bucket].bitfield = contents; - filledBuckets.set(bucket); - } - - // store out all our chains based on free values in our hash table. - // find nearest free locations that are empty (there will always be more - // entries than strings, at present) - for (auto &m : bucketToLitOffPairs) { - u32 bucket = m.first; - deque> &d = m.second; - - // sort d by distance of the residual string (len minus our depth into - // the string). We need to put the 'furthest back' string first... - stable_sort(d.begin(), d.end(), OffsetIDFromEndOrder(long_lits)); - - while (1) { - // first time through is always at bucket, then we fill in links - filledBuckets.set(bucket); - FDRSHashEntry *ent = &tab[bucket]; - u32 lit_id = d.front().first; - u32 offset = d.front().second; - - ent->state = verify_u32(litToOffsetVal[lit_id] + offset + max_len); - ent->link = (u32)LINK_INVALID; - - d.pop_front(); - if (d.empty()) { - break; - } - // now, if there is another value - // find a bucket for it and put in 'bucket' and repeat - // all we really need to do is find something not in filledBuckets, - // ideally something close to bucket - // we search backward and forward from bucket, trying to stay as - // close as possible. - UNUSED bool found = false; - int bucket_candidate = 0; - for (u32 k = 1; k < numEntries * 2; k++) { - bucket_candidate = bucket + (((k & 1) == 0) - ? 
(-(int)k / 2) : (k / 2)); - if (bucket_candidate < 0 || - (size_t)bucket_candidate >= numEntries) { - continue; - } - if (!filledBuckets.test(bucket_candidate)) { - found = true; - break; - } - } - - assert(found); - bucket = bucket_candidate; - ent->link = bucket; - } - } -} - -static -size_t maxMaskLen(const vector &lits) { - size_t rv = 0; - for (const auto &lit : lits) { - rv = max(rv, lit.msk.size()); - } - return rv; -} - -pair, size_t> -fdrBuildTableStreaming(const vector &lits, - hwlmStreamingControl &stream_control) { - // refuse to compile if we are forced to have smaller than minimum - // history required for long-literal support, full stop - // otherwise, choose the maximum of the preferred history quantity - // (currently a fairly extravagant 32) or the already used history - // quantity - subject to the limitation of stream_control.history_max - - const size_t MIN_HISTORY_REQUIRED = 32; - - if (MIN_HISTORY_REQUIRED > stream_control.history_max) { - throw std::logic_error("Cannot set history to minimum history required"); - } - - size_t max_len = - MIN(stream_control.history_max, - MAX(MIN_HISTORY_REQUIRED, stream_control.history_min)); - assert(max_len >= MIN_HISTORY_REQUIRED); - size_t max_mask_len = maxMaskLen(lits); - - vector long_lits; - if (!setupLongLits(lits, long_lits, max_len) || false) { - // "Don't need to do anything" path, not really a fail - DEBUG_PRINTF("Streaming literal path produces no table\n"); - - // we want enough history to manage the longest literal and the longest - // mask. - stream_control.literal_history_required = - max(maxLen(lits), max_mask_len) - 1; - stream_control.literal_stream_state_required = 0; - return {nullptr, size_t{0}}; - } - - // Ensure that we have enough room for the longest mask. - if (max_mask_len) { - max_len = max(max_len, max_mask_len - 1); - } - - u32 boundary[MAX_MODES]; - u32 positions[MAX_MODES]; - u32 hashEntries[MAX_MODES]; - - analyzeLits(long_lits, max_len, boundary, positions, hashEntries); - - // first assess the size and find our caseless threshold - size_t headerSize = ROUNDUP_16(sizeof(FDRSTableHeader)); - - size_t litTabOffset = headerSize; - - size_t litTabNumEntries = long_lits.size() + 1; - size_t litTabSize = ROUNDUP_16(litTabNumEntries * sizeof(FDRSLiteral)); - - size_t wholeLitTabOffset = litTabOffset + litTabSize; - size_t totalWholeLitTabSize = ROUNDUP_16(positions[CASEFUL] + - positions[CASELESS]); - - size_t htOffset[MAX_MODES]; - size_t htSize[MAX_MODES]; - - htOffset[CASEFUL] = wholeLitTabOffset + totalWholeLitTabSize; - htSize[CASEFUL] = hashEntries[CASEFUL] * sizeof(FDRSHashEntry); - htOffset[CASELESS] = htOffset[CASEFUL] + htSize[CASEFUL]; - htSize[CASELESS] = hashEntries[CASELESS] * sizeof(FDRSHashEntry); - - size_t tabSize = ROUNDUP_16(htOffset[CASELESS] + htSize[CASELESS]); - - // need to add +2 to both of these to allow space for the actual largest - // value as well as handling the fact that we add one to the space when - // storing out a position to allow zero to mean "no stream state value" - u8 streamBits[MAX_MODES]; - streamBits[CASEFUL] = lg2(roundUpToPowerOfTwo(positions[CASEFUL] + 2)); - streamBits[CASELESS] = lg2(roundUpToPowerOfTwo(positions[CASELESS] + 2)); - u32 tot_state_bytes = (streamBits[CASEFUL] + streamBits[CASELESS] + 7) / 8; - - auto secondaryTable = aligned_zmalloc_unique(tabSize); - assert(secondaryTable); // otherwise would have thrown std::bad_alloc - - // then fill it in - u8 * ptr = secondaryTable.get(); - FDRSTableHeader * header = (FDRSTableHeader *)ptr; - // fill 
in header - header->pseudoEngineID = (u32)0xffffffff; - header->N = verify_u8(max_len); // u8 so doesn't matter; won't go > 255 - for (u32 m = CASEFUL; m < MAX_MODES; ++m) { - header->boundary[m] = boundary[m]; - header->hashOffset[m] = verify_u32(htOffset[m]); - header->hashNBits[m] = lg2(hashEntries[m]); - header->streamStateBits[m] = streamBits[m]; - } - assert(tot_state_bytes < sizeof(u64a)); - header->streamStateBytes = verify_u8(tot_state_bytes); // u8 - - ptr += headerSize; - - // now fill in the rest - - FDRSLiteral * litTabPtr = (FDRSLiteral *)ptr; - ptr += litTabSize; - - map litToOffsetVal; - for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { - u32 entry = verify_u32(i - long_lits.begin()); - u32 offset = verify_u32(ptr - secondaryTable.get()); - - // point the table entry to the string location - litTabPtr[entry].offset = offset; - - litToOffsetVal[entry] = offset; - - // copy the string into the string location - memcpy(ptr, i->s.c_str(), i->s.size()); - - ptr += i->s.size(); // and the string location - } - - // fill in final lit table entry with current ptr (serves as end value) - litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable.get()); - - // fill hash tables - ptr = secondaryTable.get() + htOffset[CASEFUL]; - for (u32 m = CASEFUL; m < MAX_MODES; ++m) { - fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m], - (Modes)m, litToOffsetVal); - ptr += htSize[m]; - } - - // tell the world what we did - stream_control.literal_history_required = max_len; - stream_control.literal_stream_state_required = tot_state_bytes; - return {move(secondaryTable), tabSize}; -} - -} // namespace ue2 diff --git a/src/fdr/fdr_streaming_internal.h b/src/fdr/fdr_streaming_internal.h deleted file mode 100644 index 11b07b56..00000000 --- a/src/fdr/fdr_streaming_internal.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef FDR_STREAMING_INTERNAL_H -#define FDR_STREAMING_INTERNAL_H - -#include "ue2common.h" -#include "fdr_internal.h" -#include "util/unaligned.h" - -// tertiary table: -// a header (FDRSTableHeader) -// long_lits.size()+1 entries holding an offset to the string in the -// 'whole literal table' (FDRSLiteral structure) -// the whole literal table - every string packed in (freeform) -// hash table (caseful) (FDRSHashEntry) -// hash table (caseless) (FDRSHashEntry) - -enum Modes { - CASEFUL = 0, - CASELESS = 1, - MAX_MODES = 2 -}; - -// We have one of these structures hanging off the 'link' of our secondary -// FDR table that handles streaming strings -struct FDRSTableHeader { - u32 pseudoEngineID; // set to 0xffffffff to indicate this isn't an FDR - - // string id one beyond the maximum entry for this type of literal - // boundary[CASEFUL] is the end of the caseful literals - // boundary[CASELESS] is the end of the caseless literals and one beyond - // the largest literal id (the size of the littab) - u32 boundary[MAX_MODES]; - - // offsets are 0 if no such table exists - // offset from the base of the tertiary structure to the hash table - u32 hashOffset[MAX_MODES]; - u32 hashNBits[MAX_MODES]; // lg2 of the size of the hash table - - u8 streamStateBits[MAX_MODES]; - u8 streamStateBytes; // total size of packed stream state in bytes - u8 N; // prefix lengths - u16 pad; -}; - -// One of these structures per literal entry in our secondary FDR table. -struct FDRSLiteral { - u32 offset; - // potentially - another u32 to point to the 'next lesser included literal' - // which would be a literal that overlaps this one in such a way that a - // failure to match _this_ literal can leave us in a state that we might - // still match that literal. Offset information might also be called for, - // in which case we might be wanting to use a FDRSLiteralOffset -}; - -typedef u32 FDRSLiteralOffset; - -#define LINK_INVALID 0xffffffff - -// One of these structures per hash table entry in our secondary FDR table -struct FDRSHashEntry { - u64a bitfield; - FDRSLiteralOffset state; - u32 link; -}; - -static really_inline -u32 get_start_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { - return m == CASEFUL ? 
0 : h->boundary[m-1]; -} - -static really_inline -u32 get_end_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { - return h->boundary[m]; -} - -static really_inline -const struct FDRSLiteral * getLitTab(const struct FDRSTableHeader * h) { - return (const struct FDRSLiteral *) (((const u8 *)h) + - ROUNDUP_16(sizeof(struct FDRSTableHeader))); -} - -static really_inline -u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, enum Modes m) { - return getLitTab(h)[get_start_lit_idx(h, m)].offset; -} - -static really_inline -u32 packStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { - return v - getBaseOffsetOfLits(h, m) + 1; -} - -static really_inline -u32 unpackStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { - return v + getBaseOffsetOfLits(h, m) - 1; -} - -static really_inline -u32 has_bit(const struct FDRSHashEntry * ent, u32 bit) { - return (ent->bitfield >> bit) & 0x1; -} - -static really_inline -u32 streaming_hash(const u8 *ptr, UNUSED size_t len, enum Modes mode) { - const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL; - const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL; - assert(len >= 32); - - u64a v1 = unaligned_load_u64a(ptr); - u64a v2 = unaligned_load_u64a(ptr + 8); - u64a v3 = unaligned_load_u64a(ptr + 16); - if (mode == CASELESS) { - v1 &= CASEMASK; - v2 &= CASEMASK; - v3 &= CASEMASK; - } - v1 *= MULTIPLIER; - v2 *= (MULTIPLIER*MULTIPLIER); - v3 *= (MULTIPLIER*MULTIPLIER*MULTIPLIER); - v1 >>= 32; - v2 >>= 32; - v3 >>= 32; - return v1 ^ v2 ^ v3; -} - -#endif diff --git a/src/fdr/fdr_streaming_runtime.h b/src/fdr/fdr_streaming_runtime.h deleted file mode 100644 index 8e264c76..00000000 --- a/src/fdr/fdr_streaming_runtime.h +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef FDR_STREAMING_RUNTIME_H -#define FDR_STREAMING_RUNTIME_H - -#include "fdr_streaming_internal.h" -#include "util/partial_store.h" - -#include - -static really_inline -const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) { - const u8 * linkPtr = ((const u8 *)fdr) + fdr->link; - // test if it's not really a engineID, but a 'pseudo engine id' - assert(*(const u32 *)linkPtr == 0xffffffff); - assert(linkPtr); - return (const struct FDRSTableHeader *)linkPtr; -} - -// Reads from stream state and unpacks values into stream state table. -static really_inline -void getStreamStates(const struct FDRSTableHeader * streamingTable, - const u8 * stream_state, u32 * table) { - assert(streamingTable); - assert(stream_state); - assert(table); - - u8 ss_bytes = streamingTable->streamStateBytes; - u8 ssb = streamingTable->streamStateBits[CASEFUL]; - UNUSED u8 ssb_nc = streamingTable->streamStateBits[CASELESS]; - assert(ss_bytes == (ssb + ssb_nc + 7) / 8); - -#if defined(ARCH_32_BIT) - // On 32-bit hosts, we may be able to avoid having to do any u64a - // manipulation at all. - if (ss_bytes <= 4) { - u32 ssb_mask = (1U << ssb) - 1; - u32 streamVal = partial_load_u32(stream_state, ss_bytes); - table[CASEFUL] = (u32)(streamVal & ssb_mask); - table[CASELESS] = (u32)(streamVal >> ssb); - return; - } -#endif - - u64a ssb_mask = (1ULL << ssb) - 1; - u64a streamVal = partial_load_u64a(stream_state, ss_bytes); - table[CASEFUL] = (u32)(streamVal & ssb_mask); - table[CASELESS] = (u32)(streamVal >> (u64a)ssb); -} - -#ifndef NDEBUG -// Defensive checking (used in assert) that these table values don't overflow -// outside the range available. -static really_inline UNUSED -u32 streamingTableOverflow(u32 * table, u8 ssb, u8 ssb_nc) { - u32 ssb_mask = (1ULL << (ssb)) - 1; - if (table[CASEFUL] & ~ssb_mask) { - return 1; - } - u32 ssb_nc_mask = (1ULL << (ssb_nc)) - 1; - if (table[CASELESS] & ~ssb_nc_mask) { - return 1; - } - return 0; -} -#endif - -// Reads from stream state table and packs values into stream state. -static really_inline -void setStreamStates(const struct FDRSTableHeader * streamingTable, - u8 * stream_state, u32 * table) { - assert(streamingTable); - assert(stream_state); - assert(table); - - u8 ss_bytes = streamingTable->streamStateBytes; - u8 ssb = streamingTable->streamStateBits[CASEFUL]; - UNUSED u8 ssb_nc = streamingTable->streamStateBits[CASELESS]; - assert(ss_bytes == (ssb + ssb_nc + 7) / 8); - assert(!streamingTableOverflow(table, ssb, ssb_nc)); - -#if defined(ARCH_32_BIT) - // On 32-bit hosts, we may be able to avoid having to do any u64a - // manipulation at all. - if (ss_bytes <= 4) { - u32 stagingStreamState = table[CASEFUL]; - stagingStreamState |= (table[CASELESS] << ssb); - - partial_store_u32(stream_state, stagingStreamState, ss_bytes); - return; - } -#endif - - u64a stagingStreamState = (u64a)table[CASEFUL]; - stagingStreamState |= (u64a)table[CASELESS] << ((u64a)ssb); - partial_store_u64a(stream_state, stagingStreamState, ss_bytes); -} - -u32 fdrStreamStateActive(const struct FDR * fdr, const u8 * stream_state) { - if (!stream_state) { - return 0; - } - const struct FDRSTableHeader * streamingTable = getSHDR(fdr); - u8 ss_bytes = streamingTable->streamStateBytes; - - // We just care if there are any bits set, and the test below is faster - // than a partial_load_u64a (especially on 32-bit hosts). 
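The get/setStreamStates pair above packs two position values into streamStateBytes of stream state: the caseful value in the low streamStateBits[CASEFUL] bits and the caseless value in the bits above it. A schematic of the scheme, plus a worked sizing example matching the "+ 2" and rounding logic in fdrBuildTableStreaming; helper names here are illustrative, u32/u64a come from ue2common.h, and roundUpToPowerOfTwo is the helper shown earlier in this file:

    /* Pack and unpack two sub-byte-aligned fields. */
    static u64a pack2(u64a caseful, u64a caseless, u32 ssb) {
        return caseful | (caseless << ssb);
    }

    static void unpack2(u64a v, u32 ssb, u64a *caseful, u64a *caseless) {
        u64a mask = (1ULL << ssb) - 1;
        *caseful = v & mask;
        *caseless = v >> ssb;
    }

    /* Sizing: 1000 caseful and 300 caseless positions need
     * lg2(roundUpToPowerOfTwo(1002)) = 10 bits and
     * lg2(roundUpToPowerOfTwo(302)) = 9 bits, hence (10 + 9 + 7) / 8 = 3
     * bytes of packed stream state per stream. */
    static u32 stream_state_bytes(u32 pos_caseful, u32 pos_caseless) {
        u32 b1 = lg2(roundUpToPowerOfTwo(pos_caseful + 2));
        u32 b2 = lg2(roundUpToPowerOfTwo(pos_caseless + 2));
        return (b1 + b2 + 7) / 8;
    }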
- for (u32 i = 0; i < ss_bytes; i++) { - if (*stream_state) { - return 1; - } - ++stream_state; - } - return 0; -} - -// binary search for the literal index that contains the current state -static really_inline -u32 findLitTabEntry(const struct FDRSTableHeader * streamingTable, - u32 stateValue, enum Modes m) { - const struct FDRSLiteral * litTab = getLitTab(streamingTable); - u32 lo = get_start_lit_idx(streamingTable, m); - u32 hi = get_end_lit_idx(streamingTable, m); - - // Now move stateValue back by one so that we're looking for the - // litTab entry that includes it the string, not the one 'one past' it - stateValue -= 1; - assert(lo != hi); - assert(litTab[lo].offset <= stateValue); - assert(litTab[hi].offset > stateValue); - - // binary search to find the entry e such that: - // litTab[e].offsetToLiteral <= stateValue < litTab[e+1].offsetToLiteral - while (lo + 1 < hi) { - u32 mid = (lo + hi) / 2; - if (litTab[mid].offset <= stateValue) { - lo = mid; - } else { //(litTab[mid].offset > stateValue) { - hi = mid; - } - } - assert(litTab[lo].offset <= stateValue); - assert(litTab[hi].offset > stateValue); - return lo; -} - -static really_inline -void fdrUnpackStateMode(struct FDR_Runtime_Args *a, - const struct FDRSTableHeader *streamingTable, - const struct FDRSLiteral * litTab, - const u32 *state_table, - const enum Modes m) { - if (!state_table[m]) { - return; - } - - u32 stateValue = unpackStateVal(streamingTable, m, state_table[m]); - u32 idx = findLitTabEntry(streamingTable, stateValue, m); - size_t found_offset = litTab[idx].offset; - const u8 * found_buf = found_offset + (const u8 *)streamingTable; - size_t found_sz = stateValue - found_offset; - if (m == CASEFUL) { - a->buf_history = found_buf; - a->len_history = found_sz; - } else { - a->buf_history_nocase = found_buf; - a->len_history_nocase = found_sz; - } -} - -static really_inline -void fdrUnpackState(const struct FDR * fdr, struct FDR_Runtime_Args * a, - const u8 * stream_state) { - // nothing to do if there's no stream state for the case - if (!stream_state) { - return; - } - - const struct FDRSTableHeader * streamingTable = getSHDR(fdr); - const struct FDRSLiteral * litTab = getLitTab(streamingTable); - - u32 state_table[MAX_MODES]; - getStreamStates(streamingTable, stream_state, state_table); - - fdrUnpackStateMode(a, streamingTable, litTab, state_table, CASEFUL); - fdrUnpackStateMode(a, streamingTable, litTab, state_table, CASELESS); -} - -static really_inline -u32 do_single_confirm(const struct FDRSTableHeader *streamingTable, - const struct FDR_Runtime_Args *a, u32 hashState, - enum Modes m) { - const struct FDRSLiteral * litTab = getLitTab(streamingTable); - u32 idx = findLitTabEntry(streamingTable, hashState, m); - size_t found_offset = litTab[idx].offset; - const u8 * s1 = found_offset + (const u8 *)streamingTable; - assert(hashState > found_offset); - size_t l1 = hashState - found_offset; - const u8 * buf = a->buf; - size_t len = a->len; - const char nocase = m != CASEFUL; - - if (l1 > len) { - const u8 * hist = nocase ? a->buf_history_nocase : a->buf_history; - size_t hist_len = nocase ? 
a->len_history_nocase : a->len_history; - - if (l1 > len+hist_len) { - return 0; // Break out - not enough total history - } - - size_t overhang = l1 - len; - assert(overhang <= hist_len); - - if (cmpForward(hist + hist_len - overhang, s1, overhang, nocase)) { - return 0; - } - s1 += overhang; - l1 -= overhang; - } - // if we got here, we don't need history or we compared ok out of history - assert(l1 <= len); - - if (cmpForward(buf + len - l1, s1, l1, nocase)) { - return 0; - } - return hashState; // our new state -} - -static really_inline -void fdrFindStreamingHash(const struct FDR_Runtime_Args *a, - const struct FDRSTableHeader *streamingTable, - u8 hash_len, u32 *hashes) { - u8 tempbuf[128]; - const u8 *base; - if (hash_len > a->len) { - assert(hash_len <= 128); - size_t overhang = hash_len - a->len; - assert(overhang <= a->len_history); - memcpy(tempbuf, a->buf_history + a->len_history - overhang, overhang); - memcpy(tempbuf + overhang, a->buf, a->len); - base = tempbuf; - } else { - assert(hash_len <= a->len); - base = a->buf + a->len - hash_len; - } - - if (streamingTable->hashNBits[CASEFUL]) { - hashes[CASEFUL] = streaming_hash(base, hash_len, CASEFUL); - } - if (streamingTable->hashNBits[CASELESS]) { - hashes[CASELESS] = streaming_hash(base, hash_len, CASELESS); - } -} - -static really_inline -const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable, - u32 h, const enum Modes m) { - u32 nbits = streamingTable->hashNBits[m]; - if (!nbits) { - return NULL; - } - - u32 h_ent = h & ((1 << nbits) - 1); - u32 h_low = (h >> nbits) & 63; - - const struct FDRSHashEntry *tab = - (const struct FDRSHashEntry *)((const u8 *)streamingTable - + streamingTable->hashOffset[m]); - const struct FDRSHashEntry *ent = tab + h_ent; - - if (!has_bit(ent, h_low)) { - return NULL; - } - - return ent; -} - -static really_inline -void fdrPackStateMode(u32 *state_table, const struct FDR_Runtime_Args *a, - const struct FDRSTableHeader *streamingTable, - const struct FDRSHashEntry *ent, const enum Modes m) { - assert(ent); - assert(streamingTable->hashNBits[m]); - - const struct FDRSHashEntry *tab = - (const struct FDRSHashEntry *)((const u8 *)streamingTable - + streamingTable->hashOffset[m]); - - while (1) { - u32 tmp = 0; - if ((tmp = do_single_confirm(streamingTable, a, ent->state, m))) { - state_table[m] = packStateVal(streamingTable, m, tmp); - break; - } - if (ent->link == LINK_INVALID) { - break; - } - ent = tab + ent->link; - } -} - -static really_inline -void fdrPackState(const struct FDR *fdr, const struct FDR_Runtime_Args *a, - u8 *stream_state) { - // nothing to do if there's no stream state for the case - if (!stream_state) { - return; - } - - // get pointers to the streamer FDR and the tertiary structure - const struct FDRSTableHeader *streamingTable = getSHDR(fdr); - - assert(streamingTable->N); - - u32 state_table[MAX_MODES] = {0, 0}; - - // if we don't have enough history, we don't need to do anything - if (streamingTable->N <= a->len + a->len_history) { - u32 hashes[MAX_MODES] = {0, 0}; - - fdrFindStreamingHash(a, streamingTable, streamingTable->N, hashes); - - const struct FDRSHashEntry *ent_ful = getEnt(streamingTable, - hashes[CASEFUL], CASEFUL); - const struct FDRSHashEntry *ent_less = getEnt(streamingTable, - hashes[CASELESS], CASELESS); - - if (ent_ful) { - fdrPackStateMode(state_table, a, streamingTable, ent_ful, - CASEFUL); - } - - if (ent_less) { - fdrPackStateMode(state_table, a, streamingTable, ent_less, - CASELESS); - } - } - - 
setStreamStates(streamingTable, stream_state, state_table); -} - -#endif diff --git a/src/grey.cpp b/src/grey.cpp index bad56b56..340a34bf 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -51,6 +51,7 @@ Grey::Grey(void) : allowLbr(true), allowMcClellan(true), allowSheng(true), + allowMcSheng(true), allowPuff(true), allowLiteral(true), allowRose(true), @@ -217,6 +218,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowLbr); G_UPDATE(allowMcClellan); G_UPDATE(allowSheng); + G_UPDATE(allowMcSheng); G_UPDATE(allowPuff); G_UPDATE(allowLiteral); G_UPDATE(allowRose); diff --git a/src/grey.h b/src/grey.h index 90f5f826..4882af7d 100644 --- a/src/grey.h +++ b/src/grey.h @@ -51,6 +51,7 @@ struct Grey { bool allowLbr; bool allowMcClellan; bool allowSheng; + bool allowMcSheng; bool allowPuff; bool allowLiteral; bool allowRose; diff --git a/src/hs.cpp b/src/hs.cpp index 07f6d2c1..f64e867a 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -192,6 +192,14 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, return HS_COMPILER_ERROR; } +#if defined(FAT_RUNTIME) + if (!check_ssse3()) { + *db = nullptr; + *comp_error = generateCompileError("Unsupported architecture", -1); + return HS_ARCH_ERROR; + } +#endif + if (!checkMode(mode, comp_error)) { *db = nullptr; assert(*comp_error); // set by checkMode. @@ -319,6 +327,13 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, return HS_COMPILER_ERROR; } +#if defined(FAT_RUNTIME) + if (!check_ssse3()) { + *error = generateCompileError("Unsupported architecture", -1); + return HS_ARCH_ERROR; + } +#endif + if (!info) { *error = generateCompileError("Invalid parameter: info is NULL", -1); return HS_COMPILER_ERROR; @@ -426,6 +441,11 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform) { extern "C" HS_PUBLIC_API hs_error_t hs_free_compile_error(hs_compile_error_t *error) { +#if defined(FAT_RUNTIME) + if (!check_ssse3()) { + return HS_ARCH_ERROR; + } +#endif freeCompileError(error); return HS_SUCCESS; } diff --git a/src/hs_common.h b/src/hs_common.h index 4bf31146..b25b1842 100644 --- a/src/hs_common.h +++ b/src/hs_common.h @@ -435,6 +435,23 @@ hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func); */ const char *hs_version(void); +/** + * Utility function to test the current system architecture. + * + * Hyperscan requires the Supplemental Streaming SIMD Extensions 3 instruction + * set. This function can be called on any x86 platform to determine if the + * system provides the required instruction set. + * + * This function does not test for more advanced features if Hyperscan has + * been built for a more specific architecture, for example the AVX2 + * instruction set. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_ARCH_ERROR if system does not + * support Hyperscan. + */ +hs_error_t hs_valid_platform(void); + /** * @defgroup HS_ERROR hs_error_t values * @@ -519,6 +536,17 @@ const char *hs_version(void); */ #define HS_SCRATCH_IN_USE (-10) +/** + * Unsupported CPU architecture. + * + * This error is returned when Hyperscan is able to detect that the current + * system does not support the required instruction set. + * + * At a minimum, Hyperscan requires Supplemental Streaming SIMD Extensions 3 + * (SSSE3). 
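hs_valid_platform() (implemented in the new hs_valid_platform.c below) is the intended early check for this error code. A minimal usage sketch; the message text is illustrative:

    #include <stdio.h>
    #include "hs_common.h"

    static int ensure_platform(void) {
        if (hs_valid_platform() != HS_SUCCESS) {
            fprintf(stderr, "host lacks SSSE3: Hyperscan cannot run\n");
            return -1;
        }
        return 0;
    }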
+ */ +#define HS_ARCH_ERROR (-11) + /** @} */ #ifdef __cplusplus diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c new file mode 100644 index 00000000..939cde1f --- /dev/null +++ b/src/hs_valid_platform.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "hs_common.h" +#include "util/cpuid_flags.h" + +HS_PUBLIC_API +hs_error_t hs_valid_platform(void) { + /* Hyperscan requires SSSE3, anything else is a bonus */ + if (check_ssse3()) { + return HS_SUCCESS; + } else { + return HS_ARCH_ERROR; + } +} diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 2e16f1ac..3c7615a7 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -200,8 +200,7 @@ hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch, size_t len, size_t start, HWLMCallback cb, - void *ctxt, hwlm_group_t groups, - u8 *stream_state) { + void *ctxt, hwlm_group_t groups) { const u8 *hbuf = scratch->core_info.hbuf; const size_t hlen = scratch->core_info.hlen; const u8 *buf = scratch->core_info.buf; @@ -234,13 +233,10 @@ hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch, DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); aa = &t->accel1; } - // if no active stream state, use acceleration - if (!fdrStreamStateActive(HWLM_C_DATA(t), stream_state)) { - do_accel_streaming(aa, hbuf, hlen, buf, len, &start); - } + do_accel_streaming(aa, hbuf, hlen, buf, len, &start); DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, - start, cb, ctxt, groups, stream_state); + start, cb, ctxt, groups); } } diff --git a/src/hwlm/hwlm.h b/src/hwlm/hwlm.h index 009550e9..a17575df 100644 --- a/src/hwlm/hwlm.h +++ b/src/hwlm/hwlm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -132,8 +132,7 @@ hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len, hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, struct hs_scratch *scratch, size_t len, size_t start, HWLMCallback callback, - void *context, hwlm_group_t groups, - u8 *stream_state); + void *context, hwlm_group_t groups); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index b1814245..fa6335c9 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -461,7 +461,8 @@ void findForwardAccelScheme(const vector &lits, } const CharReach &cr = reach[min_offset]; - if (shuftiBuildMasks(cr, &aux->shufti.lo, &aux->shufti.hi) != -1) { + if (-1 != + shuftiBuildMasks(cr, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) { DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n", describeClass(cr).c_str(), cr.count(), min_offset); aux->shufti.accel_type = ACCEL_SHUFTI; @@ -469,7 +470,7 @@ void findForwardAccelScheme(const vector &lits, return; } - truffleBuildMasks(cr, &aux->truffle.mask1, &aux->truffle.mask2); + truffleBuildMasks(cr, (u8 *)&aux->truffle.mask1, (u8 *)&aux->truffle.mask2); DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n", describeClass(cr).c_str(), cr.count(), min_offset); aux->truffle.accel_type = ACCEL_TRUFFLE; @@ -523,7 +524,7 @@ bool isNoodleable(const vector &lits, } if (stream_control) { // nullptr if in block mode - if (lits.front().s.length() + 1 > stream_control->history_max) { + if (lits.front().s.length() > stream_control->history_max + 1) { DEBUG_PRINTF("length of %zu too long for history max %zu\n", lits.front().s.length(), stream_control->history_max); @@ -552,6 +553,12 @@ aligned_unique_ptr hwlmBuild(const 
vector &lits, if (stream_control) { assert(stream_control->history_min <= stream_control->history_max); + + // We should not have been passed any literals that are too long to + // match with a maximally-sized history buffer. + assert(all_of(begin(lits), end(lits), [&](const hwlmLiteral &lit) { + return lit.s.length() <= stream_control->history_max + 1; + })); } // Check that we haven't exceeded the maximum number of literals. @@ -602,7 +609,6 @@ aligned_unique_ptr hwlmBuild(const vector &lits, stream_control->literal_history_required = lit.s.length() - 1; assert(stream_control->literal_history_required <= stream_control->history_max); - stream_control->literal_stream_state_required = 0; } eng = move(noodle); } else { diff --git a/src/hwlm/hwlm_build.h b/src/hwlm/hwlm_build.h index b5bdb0ea..fbf359e6 100644 --- a/src/hwlm/hwlm_build.h +++ b/src/hwlm/hwlm_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -63,10 +63,6 @@ struct hwlmStreamingControl { /** \brief OUT parameter: History required by the literal matcher to * correctly match all literals. */ size_t literal_history_required; - - /** OUT parameter: Stream state required by literal matcher in bytes. Can - * be zero, and generally will be small (0-8 bytes). */ - size_t literal_stream_state_required; }; /** \brief Build an \ref HWLM literal matcher runtime structure for a group of diff --git a/src/hwlm/hwlm_literal.cpp b/src/hwlm/hwlm_literal.cpp index 9e365a0c..b0968d79 100644 --- a/src/hwlm/hwlm_literal.cpp +++ b/src/hwlm/hwlm_literal.cpp @@ -86,6 +86,7 @@ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in, const vector &msk_in, const vector &cmp_in) : s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in), groups(groups_in), msk(msk_in), cmp(cmp_in) { + assert(s.size() <= HWLM_LITERAL_MAX_LEN); assert(msk.size() <= HWLM_MASKLEN); assert(msk.size() == cmp.size()); diff --git a/src/hwlm/hwlm_literal.h b/src/hwlm/hwlm_literal.h index 7e63a6f3..b7af99d3 100644 --- a/src/hwlm/hwlm_literal.h +++ b/src/hwlm/hwlm_literal.h @@ -41,6 +41,9 @@ namespace ue2 { +/** \brief Max length of the literal passed to HWLM. */ +#define HWLM_LITERAL_MAX_LEN 255 + /** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. 
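The relaxed noodle length check and the new hwlmBuild assert both follow from the same observation: a match reported at the end of the current block always has at least one byte in that block, so a literal of length L needs at most L - 1 bytes of history. The acceptance test, restated as a sketch with an illustrative helper name:

    /* A literal of length lit_len can be matched with only lit_len - 1
     * bytes of history, since its final byte is in the current block. */
    static int fits_history(size_t lit_len, size_t history_max) {
        return lit_len <= history_max + 1;
    }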
*/ #define HWLM_MASKLEN 8 diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp old mode 100755 new mode 100644 index ba21adc7..d257b530 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -56,15 +56,6 @@ struct path { }; }; -static UNUSED -string describeClasses(const vector &v) { - std::ostringstream oss; - for (const auto &cr : v) { - describeClass(oss, cr); - } - return oss.str(); -} - static void dump_paths(const vector &paths) { for (UNUSED const auto &p : paths) { @@ -482,9 +473,10 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, } if (double_byte_ok(info) && - shuftiBuildDoubleMasks(info.double_cr, info.double_byte, - &accel->dshufti.lo1, &accel->dshufti.hi1, - &accel->dshufti.lo2, &accel->dshufti.hi2)) { + shuftiBuildDoubleMasks( + info.double_cr, info.double_byte, (u8 *)&accel->dshufti.lo1, + (u8 *)&accel->dshufti.hi1, (u8 *)&accel->dshufti.lo2, + (u8 *)&accel->dshufti.hi2)) { accel->accel_type = ACCEL_DSHUFTI; accel->dshufti.offset = verify_u8(info.double_offset); DEBUG_PRINTF("state %hu is double shufti\n", this_idx); @@ -520,14 +512,16 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, } accel->accel_type = ACCEL_SHUFTI; - if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo, &accel->shufti.hi)) { + if (-1 != shuftiBuildMasks(info.cr, (u8 *)&accel->shufti.lo, + (u8 *)&accel->shufti.hi)) { DEBUG_PRINTF("state %hu is shufti\n", this_idx); return; } assert(!info.cr.none()); accel->accel_type = ACCEL_TRUFFLE; - truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2); + truffleBuildMasks(info.cr, (u8 *)&accel->truffle.mask1, + (u8 *)&accel->truffle.mask2); DEBUG_PRINTF("state %hu is truffle\n", this_idx); } diff --git a/src/nfa/accel_dfa_build_strat.h b/src/nfa/accel_dfa_build_strat.h old mode 100755 new mode 100644 diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index 6e2b8f41..e99e71a5 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -41,7 +41,7 @@ #include "util/charreach.h" #include "util/dump_charclass.h" #include "util/dump_mask.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #include #include @@ -147,16 +147,20 @@ const char *accelName(u8 accel_type) { } static -void dumpShuftiCharReach(FILE *f, const m128 &lo, const m128 &hi) { +void dumpShuftiCharReach(FILE *f, const u8 *lo, const u8 *hi) { CharReach cr = shufti2cr(lo, hi); fprintf(f, "count %zu class %s\n", cr.count(), describeClass(cr).c_str()); } static -vector shufti2cr_array(const m128 lo_in, const m128 hi_in) { - const u8 *lo = (const u8 *)&lo_in; - const u8 *hi = (const u8 *)&hi_in; +vector dshufti2cr_array(const u8 *lo_in, const u8 *hi_in) { + u8 lo[16]; + u8 hi[16]; + for (u32 i = 0; i < 16; i++) { + lo[i] = ~lo_in[i]; + hi[i] = ~hi_in[i]; + } vector crs(8); for (u32 i = 0; i < 256; i++) { u32 combined = lo[(u8)i & 0xf] & hi[(u8)i >> 4]; @@ -169,10 +173,10 @@ vector shufti2cr_array(const m128 lo_in, const m128 hi_in) { } static -void dumpDShuftiCharReach(FILE *f, const m128 &lo1, const m128 &hi1, - const m128 &lo2, const m128 &hi2) { - vector cr1 = shufti2cr_array(not128(lo1), not128(hi1)); - vector cr2 = shufti2cr_array(not128(lo2), not128(hi2)); +void dumpDShuftiCharReach(FILE *f, const u8 *lo1, const u8 *hi1, + const u8 *lo2, const u8 *hi2) { + vector cr1 = dshufti2cr_array(lo1, hi1); + vector cr2 = dshufti2cr_array(lo2, hi2); map > cr1_group; assert(cr1.size() == 8 && cr2.size() == 8); for (u32 i = 0; i < 8; i++) { @@ -208,26 +212,22 @@ void 
dumpDShuftiCharReach(FILE *f, const m128 &lo1, const m128 &hi1, } static -void dumpShuftiMasks(FILE *f, const m128 &lo, const m128 &hi) { - fprintf(f, "lo %s\n", - dumpMask((const u8 *)&lo, 128).c_str()); - fprintf(f, "hi %s\n", - dumpMask((const u8 *)&hi, 128).c_str()); +void dumpShuftiMasks(FILE *f, const u8 *lo, const u8 *hi) { + fprintf(f, "lo %s\n", dumpMask(lo, 128).c_str()); + fprintf(f, "hi %s\n", dumpMask(hi, 128).c_str()); } static -void dumpTruffleCharReach(FILE *f, const m128 &hiset, const m128 &hiclear) { +void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) { CharReach cr = truffle2cr(hiset, hiclear); fprintf(f, "count %zu class %s\n", cr.count(), describeClass(cr).c_str()); } static -void dumpTruffleMasks(FILE *f, const m128 &hiset, const m128 &hiclear) { - fprintf(f, "lo %s\n", - dumpMask((const u8 *)&hiset, 128).c_str()); - fprintf(f, "hi %s\n", - dumpMask((const u8 *)&hiclear, 128).c_str()); +void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) { + fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str()); + fprintf(f, "hi %s\n", dumpMask(hiclear, 128).c_str()); } @@ -256,23 +256,31 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { break; case ACCEL_SHUFTI: { fprintf(f, "\n"); - dumpShuftiMasks(f, accel.shufti.lo, accel.shufti.hi); - dumpShuftiCharReach(f, accel.shufti.lo, accel.shufti.hi); + dumpShuftiMasks(f, (const u8 *)&accel.shufti.lo, + (const u8 *)&accel.shufti.hi); + dumpShuftiCharReach(f, (const u8 *)&accel.shufti.lo, + (const u8 *)&accel.shufti.hi); break; } case ACCEL_DSHUFTI: fprintf(f, "\n"); fprintf(f, "mask 1\n"); - dumpShuftiMasks(f, accel.dshufti.lo1, accel.dshufti.hi1); + dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo1, + (const u8 *)&accel.dshufti.hi1); fprintf(f, "mask 2\n"); - dumpShuftiMasks(f, accel.dshufti.lo2, accel.dshufti.hi2); - dumpDShuftiCharReach(f, accel.dshufti.lo1, accel.dshufti.hi1, - accel.dshufti.lo2, accel.dshufti.hi2); + dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo2, + (const u8 *)&accel.dshufti.hi2); + dumpDShuftiCharReach(f, (const u8 *)&accel.dshufti.lo1, + (const u8 *)&accel.dshufti.hi1, + (const u8 *)&accel.dshufti.lo2, + (const u8 *)&accel.dshufti.hi2); break; case ACCEL_TRUFFLE: { fprintf(f, "\n"); - dumpTruffleMasks(f, accel.truffle.mask1, accel.truffle.mask2); - dumpTruffleCharReach(f, accel.truffle.mask1, accel.truffle.mask2); + dumpTruffleMasks(f, (const u8 *)&accel.truffle.mask1, + (const u8 *)&accel.truffle.mask2); + dumpTruffleCharReach(f, (const u8 *)&accel.truffle.mask1, + (const u8 *)&accel.truffle.mask2); break; } case ACCEL_MLVERM: @@ -297,28 +305,36 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { case ACCEL_MSSHUFTI: case ACCEL_MSGSHUFTI: fprintf(f, " len:%u\n", accel.mshufti.len); - dumpShuftiMasks(f, accel.mshufti.lo, accel.mshufti.hi); - dumpShuftiCharReach(f, accel.mshufti.lo, accel.mshufti.hi); + dumpShuftiMasks(f, (const u8 *)&accel.mshufti.lo, + (const u8 *)&accel.mshufti.hi); + dumpShuftiCharReach(f, (const u8 *)&accel.mshufti.lo, + (const u8 *)&accel.mshufti.hi); break; case ACCEL_MDSSHUFTI: case ACCEL_MDSGSHUFTI: fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2); - dumpShuftiMasks(f, accel.mdshufti.lo, accel.mdshufti.hi); - dumpShuftiCharReach(f, accel.mdshufti.lo, accel.mdshufti.hi); + dumpShuftiMasks(f, (const u8 *)&accel.mdshufti.lo, + (const u8 *)&accel.mdshufti.hi); + dumpShuftiCharReach(f, (const u8 *)&accel.mdshufti.lo, + (const u8 *)&accel.mdshufti.hi); break; case ACCEL_MLTRUFFLE: case ACCEL_MLGTRUFFLE: case ACCEL_MSTRUFFLE: 
case ACCEL_MSGTRUFFLE: fprintf(f, " len:%u\n", accel.mtruffle.len); - dumpTruffleMasks(f, accel.mtruffle.mask1, accel.mtruffle.mask2); - dumpTruffleCharReach(f, accel.mtruffle.mask1, accel.mtruffle.mask2); + dumpTruffleMasks(f, (const u8 *)&accel.mtruffle.mask1, + (const u8 *)&accel.mtruffle.mask2); + dumpTruffleCharReach(f, (const u8 *)&accel.mtruffle.mask1, + (const u8 *)&accel.mtruffle.mask2); break; case ACCEL_MDSTRUFFLE: case ACCEL_MDSGTRUFFLE: fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2); - dumpTruffleMasks(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2); - dumpTruffleCharReach(f, accel.mdtruffle.mask1, accel.mdtruffle.mask2); + dumpTruffleMasks(f, (const u8 *)&accel.mdtruffle.mask1, + (const u8 *)&accel.mdtruffle.mask2); + dumpTruffleCharReach(f, (const u8 *)&accel.mdtruffle.mask1, + (const u8 *)&accel.mdtruffle.mask2); break; default: fprintf(f, "\n"); diff --git a/src/nfa/accelcompile.cpp b/src/nfa/accelcompile.cpp index 75960dda..32e569ba 100644 --- a/src/nfa/accelcompile.cpp +++ b/src/nfa/accelcompile.cpp @@ -72,8 +72,8 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) { } DEBUG_PRINTF("attempting shufti for %zu chars\n", outs); - if (-1 != shuftiBuildMasks(info.single_stops, &aux->shufti.lo, - &aux->shufti.hi)) { + if (-1 != shuftiBuildMasks(info.single_stops, (u8 *)&aux->shufti.lo, + (u8 *)&aux->shufti.hi)) { aux->accel_type = ACCEL_SHUFTI; aux->shufti.offset = offset; DEBUG_PRINTF("shufti built OK\n"); @@ -86,8 +86,8 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) { DEBUG_PRINTF("building Truffle for %zu chars\n", outs); aux->accel_type = ACCEL_TRUFFLE; aux->truffle.offset = offset; - truffleBuildMasks(info.single_stops, &aux->truffle.mask1, - &aux->truffle.mask2); + truffleBuildMasks(info.single_stops, (u8 *)&aux->truffle.mask1, + (u8 *)&aux->truffle.mask2); return; } @@ -212,9 +212,10 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) { " two-byte literals\n", outs1, outs2); aux->accel_type = ACCEL_DSHUFTI; aux->dshufti.offset = offset; - if (shuftiBuildDoubleMasks(info.double_stop1, info.double_stop2, - &aux->dshufti.lo1, &aux->dshufti.hi1, - &aux->dshufti.lo2, &aux->dshufti.hi2)) { + if (shuftiBuildDoubleMasks( + info.double_stop1, info.double_stop2, (u8 *)&aux->dshufti.lo1, + (u8 *)&aux->dshufti.hi1, (u8 *)&aux->dshufti.lo2, + (u8 *)&aux->dshufti.hi2)) { return; } } @@ -372,8 +373,8 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { switch (info.ma_type) { case MultibyteAccelInfo::MAT_LONG: - if (shuftiBuildMasks(stops, &aux->mshufti.lo, - &aux->mshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, + (u8 *)&aux->mshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MLSHUFTI; @@ -381,8 +382,8 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { aux->mshufti.len = info.ma_len1; return; case MultibyteAccelInfo::MAT_LONGGRAB: - if (shuftiBuildMasks(stops, &aux->mshufti.lo, - &aux->mshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, + (u8 *)&aux->mshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MLGSHUFTI; @@ -390,8 +391,8 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { aux->mshufti.len = info.ma_len1; return; case MultibyteAccelInfo::MAT_SHIFT: - if (shuftiBuildMasks(stops, &aux->mshufti.lo, - &aux->mshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, + (u8 *)&aux->mshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MSSHUFTI; @@ -399,8 +400,8 @@ void buildAccelMulti(const AccelInfo &info, 
AccelAux *aux) { aux->mshufti.len = info.ma_len1; return; case MultibyteAccelInfo::MAT_SHIFTGRAB: - if (shuftiBuildMasks(stops, &aux->mshufti.lo, - &aux->mshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, + (u8 *)&aux->mshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MSGSHUFTI; @@ -408,8 +409,8 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { aux->mshufti.len = info.ma_len1; return; case MultibyteAccelInfo::MAT_DSHIFT: - if (shuftiBuildMasks(stops, &aux->mdshufti.lo, - &aux->mdshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo, + (u8 *)&aux->mdshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MDSSHUFTI; @@ -418,8 +419,8 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { aux->mdshufti.len2 = info.ma_len2; return; case MultibyteAccelInfo::MAT_DSHIFTGRAB: - if (shuftiBuildMasks(stops, &aux->mdshufti.lo, - &aux->mdshufti.hi) == -1) { + if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo, + (u8 *)&aux->mdshufti.hi) == -1) { break; } aux->accel_type = ACCEL_MDSGSHUFTI; @@ -441,45 +442,45 @@ void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { aux->accel_type = ACCEL_MLTRUFFLE; aux->mtruffle.offset = offset; aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mtruffle.mask2); break; case MultibyteAccelInfo::MAT_LONGGRAB: aux->accel_type = ACCEL_MLGTRUFFLE; aux->mtruffle.offset = offset; aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mtruffle.mask2); break; case MultibyteAccelInfo::MAT_SHIFT: aux->accel_type = ACCEL_MSTRUFFLE; aux->mtruffle.offset = offset; aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mtruffle.mask2); break; case MultibyteAccelInfo::MAT_SHIFTGRAB: aux->accel_type = ACCEL_MSGTRUFFLE; aux->mtruffle.offset = offset; aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mtruffle.mask2); break; case MultibyteAccelInfo::MAT_DSHIFT: aux->accel_type = ACCEL_MDSTRUFFLE; aux->mdtruffle.offset = offset; aux->mdtruffle.len1 = info.ma_len1; aux->mdtruffle.len2 = info.ma_len2; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mdtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mdtruffle.mask2); break; case MultibyteAccelInfo::MAT_DSHIFTGRAB: aux->accel_type = ACCEL_MDSGTRUFFLE; aux->mdtruffle.offset = offset; aux->mdtruffle.len1 = info.ma_len1; aux->mdtruffle.len2 = info.ma_len2; - truffleBuildMasks(stops, &aux->mtruffle.mask1, - &aux->mdtruffle.mask2); + truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, + (u8 *)&aux->mdtruffle.mask2); break; default: // shouldn't happen diff --git a/src/nfa/castle.c b/src/nfa/castle.c index 6a72ae31..7c158b31 100644 --- a/src/nfa/castle.c +++ b/src/nfa/castle.c @@ -745,10 +745,10 @@ void clear_repeats(const struct Castle *c, const struct mq *q, u8 *active) { } static really_inline -char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end, - enum MatchMode mode) { +char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end, + enum MatchMode mode) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + 
assert(n->type == CASTLE_NFA); DEBUG_PRINTF("state=%p, streamState=%p\n", q->state, q->streamState); @@ -856,14 +856,14 @@ char nfaExecCastle0_Q_i(const struct NFA *n, struct mq *q, s64a end, return mmbit_any_precise(active, c->numRepeats); } -char nfaExecCastle0_Q(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end) { DEBUG_PRINTF("entry\n"); - return nfaExecCastle0_Q_i(n, q, end, CALLBACK_OUTPUT); + return nfaExecCastle_Q_i(n, q, end, CALLBACK_OUTPUT); } -char nfaExecCastle0_Q2(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) { DEBUG_PRINTF("entry\n"); - return nfaExecCastle0_Q_i(n, q, end, STOP_AT_MATCH); + return nfaExecCastle_Q_i(n, q, end, STOP_AT_MATCH); } static @@ -896,9 +896,9 @@ s64a castleLastKillLoc(const struct Castle *c, struct mq *q) { return sp - 1; /* the repeats are never killed */ } -char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) { +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); if (q->cur == q->end) { @@ -959,9 +959,9 @@ char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report) { return 1; } -char nfaExecCastle0_reportCurrent(const struct NFA *n, struct mq *q) { +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); const struct Castle *c = getImplNfa(n); @@ -969,19 +969,19 @@ char nfaExecCastle0_reportCurrent(const struct NFA *n, struct mq *q) { return 0; } -char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, - struct mq *q) { +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); const struct Castle *c = getImplNfa(n); return castleInAccept(c, q, report, q_cur_offset(q)); } -char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q) { +char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); const struct Castle *c = getImplNfa(n); @@ -1019,9 +1019,9 @@ char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q) { } -char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) { +char nfaExecCastle_queueInitState(UNUSED const struct NFA *n, struct mq *q) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); const struct Castle *c = getImplNfa(n); @@ -1038,10 +1038,10 @@ char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) { return 0; } -char nfaExecCastle0_initCompressedState(const struct NFA *n, UNUSED u64a offset, - void *state, UNUSED u8 key) { +char nfaExecCastle_initCompressedState(const struct NFA *n, UNUSED u64a offset, + void *state, UNUSED u8 key) { assert(n && state); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry\n"); const struct Castle *c = getImplNfa(n); @@ -1070,10 +1070,10 @@ void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx, repeatPack(packed, info, rctrl, offset); } -char nfaExecCastle0_queueCompressState(const struct NFA *n, const struct mq *q, - s64a loc) { +char 
nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { assert(n && q); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry, loc=%lld\n", loc); const struct Castle *c = getImplNfa(n); @@ -1118,11 +1118,10 @@ void subCastleExpandState(const struct Castle *c, const u32 subIdx, packed + info->packedCtrlSize, offset)); } -char nfaExecCastle0_expandState(const struct NFA *n, void *dest, - const void *src, u64a offset, - UNUSED u8 key) { +char nfaExecCastle_expandState(const struct NFA *n, void *dest, const void *src, + u64a offset, UNUSED u8 key) { assert(n && dest && src); - assert(n->type == CASTLE_NFA_0); + assert(n->type == CASTLE_NFA); DEBUG_PRINTF("entry, src=%p, dest=%p, offset=%llu\n", src, dest, offset); const struct Castle *c = getImplNfa(n); diff --git a/src/nfa/castle.h b/src/nfa/castle.h index 84d79097..cc7496ca 100644 --- a/src/nfa/castle.h +++ b/src/nfa/castle.h @@ -38,24 +38,24 @@ extern "C" { struct mq; struct NFA; -char nfaExecCastle0_Q(const struct NFA *n, struct mq *q, s64a end); -char nfaExecCastle0_Q2(const struct NFA *n, struct mq *q, s64a end); -char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report); -char nfaExecCastle0_reportCurrent(const struct NFA *n, struct mq *q); -char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, - struct mq *q); -char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q); -char nfaExecCastle0_queueInitState(const struct NFA *n, struct mq *q); -char nfaExecCastle0_initCompressedState(const struct NFA *n, u64a offset, - void *state, u8 key); -char nfaExecCastle0_queueCompressState(const struct NFA *nfa, - const struct mq *q, s64a loc); -char nfaExecCastle0_expandState(const struct NFA *nfa, void *dest, - const void *src, u64a offset, u8 key); +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecCastle_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecCastle_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecCastle_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecCastle_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); -#define nfaExecCastle0_testEOD NFA_API_NO_IMPL -#define nfaExecCastle0_B_Reverse NFA_API_NO_IMPL -#define nfaExecCastle0_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecCastle_testEOD NFA_API_NO_IMPL +#define nfaExecCastle_B_Reverse NFA_API_NO_IMPL +#define nfaExecCastle_zombie_status NFA_API_ZOMBIE_NO_IMPL #ifdef __cplusplus } diff --git a/src/nfa/castle_dump.cpp b/src/nfa/castle_dump.cpp index fd1521a5..1514ca8c 100644 --- a/src/nfa/castle_dump.cpp +++ b/src/nfa/castle_dump.cpp @@ -40,18 +40,18 @@ #include "shufticompile.h" #include "trufflecompile.h" #include "util/charreach.h" +#include "util/dump_util.h" #include "util/dump_charclass.h" #ifndef DUMP_SUPPORT #error No dump support! #endif -namespace ue2 { +/* Note: No dot files for castle */ -void nfaExecCastle0_dumpDot(const struct NFA *, FILE *, - UNUSED const std::string &base) { - // No GraphViz output for Castles. 
-} +using namespace std; + +namespace ue2 { static void dumpTextSubCastle(const SubCastle &sub, FILE *f) { @@ -68,9 +68,11 @@ void dumpTextSubCastle(const SubCastle &sub, FILE *f) { fprintf(f, "\n"); } -void nfaExecCastle0_dumpText(const struct NFA *nfa, FILE *f) { +void nfaExecCastle_dump(const struct NFA *nfa, const string &base) { const Castle *c = (const Castle *)getImplNfa(nfa); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + fprintf(f, "Castle multi-tenant repeat engine\n"); fprintf(f, "\n"); fprintf(f, "Number of repeat tenants: %u\n", c->numRepeats); @@ -86,13 +88,15 @@ void nfaExecCastle0_dumpText(const struct NFA *nfa, FILE *f) { fprintf(f, "negated verm, scanning for 0x%02x\n", c->u.verm.c); break; case CASTLE_SHUFTI: { - const CharReach cr = shufti2cr(c->u.shuf.mask_lo, c->u.shuf.mask_hi); + const CharReach cr = shufti2cr((const u8 *)&c->u.shuf.mask_lo, + (const u8 *)&c->u.shuf.mask_hi); fprintf(f, "shufti, scanning for %s (%zu chars)\n", describeClass(cr).c_str(), cr.count()); break; } case CASTLE_TRUFFLE: { - const CharReach cr = truffle2cr(c->u.truffle.mask1, c->u.truffle.mask2); + const CharReach cr = truffle2cr((const u8 *)&c->u.truffle.mask1, + (const u8 *)&c->u.truffle.mask2); fprintf(f, "truffle, scanning for %s (%zu chars)\n", describeClass(cr).c_str(), cr.count()); break; @@ -113,6 +117,7 @@ void nfaExecCastle0_dumpText(const struct NFA *nfa, FILE *f) { fprintf(f, "Sub %u:\n", i); dumpTextSubCastle(sub[i], f); } + fclose(f); } } // namespace ue2 diff --git a/src/nfa/castle_dump.h b/src/nfa/castle_dump.h index 94dadec0..06e7e36e 100644 --- a/src/nfa/castle_dump.h +++ b/src/nfa/castle_dump.h @@ -31,16 +31,13 @@ #if defined(DUMP_SUPPORT) -#include #include struct NFA; namespace ue2 { -void nfaExecCastle0_dumpDot(const NFA *nfa, FILE *file, - const std::string &base); -void nfaExecCastle0_dumpText(const NFA *nfa, FILE *file); +void nfaExecCastle_dump(const NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 4bddf767..3b40ab9a 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -49,7 +49,6 @@ #include "util/graph.h" #include "util/make_unique.h" #include "util/multibit_build.h" -#include "util/multibit_internal.h" #include "util/report_manager.h" #include "util/ue2_containers.h" #include "util/verify_types.h" @@ -58,6 +57,7 @@ #include #include +#include #include using namespace std; @@ -100,13 +100,15 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) { return; } - if (shuftiBuildMasks(negated, &c->u.shuf.mask_lo, &c->u.shuf.mask_hi) != -1) { + if (shuftiBuildMasks(negated, (u8 *)&c->u.shuf.mask_lo, + (u8 *)&c->u.shuf.mask_hi) != -1) { c->type = CASTLE_SHUFTI; return; } c->type = CASTLE_TRUFFLE; - truffleBuildMasks(negated, &c->u.truffle.mask1, &c->u.truffle.mask2); + truffleBuildMasks(negated, (u8 *)&c->u.truffle.mask1, + (u8 *)&c->u.truffle.mask2); } static @@ -576,7 +578,7 @@ buildCastle(const CastleProto &proto, total_size += byte_length(stale_iter); // stale sparse iter aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); - nfa->type = verify_u8(CASTLE_NFA_0); + nfa->type = verify_u8(CASTLE_NFA); nfa->length = verify_u32(total_size); nfa->nPositions = verify_u32(subs.size()); nfa->streamStateSize = streamStateSize; @@ -903,8 +905,8 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) { u32 min_bound = pr.bounds.min; // always finite if (min_bound == 0) { // Vacuous case, we can only do this once.
assert(!edge(g.start, g.accept, g).second); - NFAEdge e = add_edge(g.start, g.accept, g).first; - g[e].top = top; + NFAEdge e = add_edge(g.start, g.accept, g); + g[e].tops.insert(top); g[u].reports.insert(pr.reports.begin(), pr.reports.end()); min_bound = 1; } @@ -912,9 +914,9 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) { for (u32 i = 0; i < min_bound; i++) { NFAVertex v = add_vertex(g); g[v].char_reach = pr.reach; - NFAEdge e = add_edge(u, v, g).first; + NFAEdge e = add_edge(u, v, g); if (u == g.start) { - g[e].top = top; + g[e].tops.insert(top); } u = v; } @@ -931,9 +933,9 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) { if (head != u) { add_edge(head, v, g); } - NFAEdge e = add_edge(u, v, g).first; + NFAEdge e = add_edge(u, v, g); if (u == g.start) { - g[e].top = top; + g[e].tops.insert(top); } u = v; } @@ -978,15 +980,10 @@ unique_ptr makeHolder(const CastleProto &proto, auto g = ue2::make_unique(proto.kind); for (const auto &m : proto.repeats) { - if (m.first >= NFA_MAX_TOP_MASKS) { - DEBUG_PRINTF("top %u too big for an NFA\n", m.first); - return nullptr; - } - addToHolder(*g, m.first, m.second); } - //dumpGraph("castle_holder.dot", g->g); + //dumpGraph("castle_holder.dot", *g); // Sanity checks. assert(allMatchStatesHaveReports(*g)); diff --git a/src/nfa/dfa_build_strat.cpp b/src/nfa/dfa_build_strat.cpp old mode 100755 new mode 100644 diff --git a/src/nfa/gough.c b/src/nfa/gough.c index 520aca93..44acd4c2 100644 --- a/src/nfa/gough.c +++ b/src/nfa/gough.c @@ -655,12 +655,6 @@ char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, const u8 *cur_buf = sp < 0 ? hend : buffer; - char report = 1; - if (mode == CALLBACK_OUTPUT) { - /* we are starting inside the history buffer: matches are suppressed */ - report = !(sp < 0); - } - if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { /* this is as far as we go */ q->cur--; @@ -691,8 +685,7 @@ char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, const u8 *final_look; if (goughExec8_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, - offset + sp, cb, context, &final_look, - report ? mode : NO_MATCHES) + offset + sp, cb, context, &final_look, mode) == MO_HALT_MATCHING) { *(u8 *)q->state = 0; return 0; @@ -724,7 +717,6 @@ char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, if (sp == 0) { cur_buf = buffer; - report = 1; } if (sp != ep) { @@ -789,12 +781,6 @@ char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, const u8 *cur_buf = sp < 0 ? hend : buffer; - char report = 1; - if (mode == CALLBACK_OUTPUT) { - /* we are starting inside the history buffer: matches are suppressed */ - report = !(sp < 0); - } - assert(q->cur); if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { /* this is as far as we go */ @@ -822,10 +808,8 @@ char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, /* do main buffer region */ const u8 *final_look; if (goughExec16_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, - offset + sp, cb, context, &final_look, - report ? 
mode : NO_MATCHES) + offset + sp, cb, context, &final_look, mode) == MO_HALT_MATCHING) { - assert(report); *(u16 *)q->state = 0; return 0; } @@ -856,7 +840,6 @@ char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, if (sp == 0) { cur_buf = buffer; - report = 1; } if (sp != ep) { diff --git a/src/nfa/goughcompile_dump.cpp b/src/nfa/goughcompile_dump.cpp index dd76b9ec..cb361cdb 100644 --- a/src/nfa/goughcompile_dump.cpp +++ b/src/nfa/goughcompile_dump.cpp @@ -275,7 +275,7 @@ void dump_vars(const GoughGraph &g, const string &base, const Grey &grey) { } void dump(const GoughGraph &g, const string &base, const Grey &grey) { - if (!grey.dumpFlags) { + if (!(grey.dumpFlags & Grey::DUMP_INT_GRAPH)) { return; } @@ -311,9 +311,9 @@ void dump_block(FILE *f, const gough_edge_id &e, } } -void dump_blocks(const map > &blocks, +void dump_blocks(const map> &blocks, const string &base, const Grey &grey) { - if (!grey.dumpFlags) { + if (!(grey.dumpFlags & Grey::DUMP_INT_GRAPH)) { return; } diff --git a/src/nfa/goughdump.cpp b/src/nfa/goughdump.cpp index 4e6e5425..1b37a0b1 100644 --- a/src/nfa/goughdump.cpp +++ b/src/nfa/goughdump.cpp @@ -37,6 +37,7 @@ #include "ue2common.h" #include "util/charreach.h" #include "util/dump_charclass.h" +#include "util/dump_util.h" #include "util/unaligned.h" #include @@ -259,8 +260,8 @@ void dumpTransitions(const NFA *nfa, FILE *f, fprintf(f, "\n"); } -void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f, - UNUSED const string &base) { +static +void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) { assert(nfa->type == GOUGH_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -279,6 +280,7 @@ void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f, fprintf(f, "}\n"); } +static void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) { assert(nfa->type == GOUGH_NFA_8); @@ -303,8 +305,8 @@ void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } -void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f, - UNUSED const string &base) { +static +void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) { assert(nfa->type == GOUGH_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -323,6 +325,7 @@ void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f, fprintf(f, "}\n"); } +static void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) { assert(nfa->type == GOUGH_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -348,4 +351,24 @@ void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +void nfaExecGough16_dump(const NFA *nfa, const string &base) { + assert(nfa->type == GOUGH_NFA_16); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + nfaExecGough16_dumpText(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + nfaExecGough16_dumpDot(nfa, f); + fclose(f); +} + +void nfaExecGough8_dump(const NFA *nfa, const string &base) { + assert(nfa->type == GOUGH_NFA_8); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + nfaExecGough8_dumpText(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + nfaExecGough8_dumpDot(nfa, f); + fclose(f); +} + } // namespace ue2 diff --git a/src/nfa/goughdump.h b/src/nfa/goughdump.h index b96938e4..2d204d5a 100644 --- a/src/nfa/goughdump.h +++ b/src/nfa/goughdump.h @@ -39,12 +39,8 @@ struct NFA; namespace ue2 { -void nfaExecGough8_dumpDot(const NFA *nfa, FILE *file, - const std::string &base); -void nfaExecGough16_dumpDot(const NFA 
*nfa, FILE *file, - const std::string &base); -void nfaExecGough8_dumpText(const NFA *nfa, FILE *file); -void nfaExecGough16_dumpText(const NFA *nfa, FILE *file); +void nfaExecGough8_dump(const NFA *nfa, const std::string &base); +void nfaExecGough16_dump(const NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/lbr.c b/src/nfa/lbr.c index 07e59239..3075be33 100644 --- a/src/nfa/lbr.c +++ b/src/nfa/lbr.c @@ -307,7 +307,7 @@ char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, UNUSED size_t begin, UNUSED size_t end, UNUSED size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Dot); + assert(nfa->type == LBR_NFA_DOT); // Nothing can kill a dot! return 0; } @@ -316,7 +316,7 @@ static really_inline char lbrRevScanVerm(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Verm); + assert(nfa->type == LBR_NFA_VERM); const struct lbr_verm *l = getImplNfa(nfa); if (begin == end) { @@ -340,7 +340,7 @@ static really_inline char lbrRevScanNVerm(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_NVerm); + assert(nfa->type == LBR_NFA_NVERM); const struct lbr_verm *l = getImplNfa(nfa); if (begin == end) { @@ -365,7 +365,7 @@ char lbrRevScanShuf(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Shuf); + assert(nfa->type == LBR_NFA_SHUF); const struct lbr_shuf *l = getImplNfa(nfa); if (begin == end) { @@ -389,7 +389,7 @@ char lbrRevScanTruf(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Truf); + assert(nfa->type == LBR_NFA_TRUF); const struct lbr_truf *l = getImplNfa(nfa); if (begin == end) { @@ -413,7 +413,7 @@ char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, UNUSED size_t begin, UNUSED size_t end, UNUSED size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Dot); + assert(nfa->type == LBR_NFA_DOT); // Nothing can kill a dot! 
return 0; } @@ -422,7 +422,7 @@ static really_inline char lbrFwdScanVerm(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Verm); + assert(nfa->type == LBR_NFA_VERM); const struct lbr_verm *l = getImplNfa(nfa); if (begin == end) { @@ -446,7 +446,7 @@ static really_inline char lbrFwdScanNVerm(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_NVerm); + assert(nfa->type == LBR_NFA_NVERM); const struct lbr_verm *l = getImplNfa(nfa); if (begin == end) { @@ -471,7 +471,7 @@ char lbrFwdScanShuf(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Shuf); + assert(nfa->type == LBR_NFA_SHUF); const struct lbr_shuf *l = getImplNfa(nfa); if (begin == end) { @@ -495,7 +495,7 @@ char lbrFwdScanTruf(const struct NFA *nfa, const u8 *buf, size_t begin, size_t end, size_t *loc) { assert(begin <= end); - assert(nfa->type == LBR_NFA_Truf); + assert(nfa->type == LBR_NFA_TRUF); const struct lbr_truf *l = getImplNfa(nfa); if (begin == end) { diff --git a/src/nfa/lbr_dump.cpp b/src/nfa/lbr_dump.cpp index 3412ddf5..0948e122 100644 --- a/src/nfa/lbr_dump.cpp +++ b/src/nfa/lbr_dump.cpp @@ -42,38 +42,17 @@ #include "trufflecompile.h" #include "util/charreach.h" #include "util/dump_charclass.h" +#include "util/dump_util.h" #ifndef DUMP_SUPPORT #error No dump support! #endif +/* Note: No dot files for LBR */ +using namespace std; + namespace ue2 { -void nfaExecLbrDot_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, - UNUSED const std::string &base) { - // No impl -} - -void nfaExecLbrVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, - UNUSED const std::string &base) { - // No impl -} - -void nfaExecLbrNVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, - UNUSED const std::string &base) { - // No impl -} - -void nfaExecLbrShuf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, - UNUSED const std::string &base) { - // No impl -} - -void nfaExecLbrTruf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, - UNUSED const std::string &base) { - // No impl -} - static void lbrDumpCommon(const lbr_common *lc, FILE *f) { const RepeatInfo *info @@ -88,60 +67,80 @@ void lbrDumpCommon(const lbr_common *lc, FILE *f) { fprintf(f, "min period: %u\n", info->minPeriod); } -void nfaExecLbrDot_dumpText(const NFA *nfa, FILE *f) { +void nfaExecLbrDot_dump(const NFA *nfa, const string &base) { assert(nfa); - assert(nfa->type == LBR_NFA_Dot); + assert(nfa->type == LBR_NFA_DOT); const lbr_dot *ld = (const lbr_dot *)getImplNfa(nfa); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); lbrDumpCommon(&ld->common, f); fprintf(f, "DOT model\n"); fprintf(f, "\n"); dumpTextReverse(nfa, f); + fclose(f); } -void nfaExecLbrVerm_dumpText(const NFA *nfa, FILE *f) { +void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) { assert(nfa); - assert(nfa->type == LBR_NFA_Verm); + assert(nfa->type == LBR_NFA_VERM); const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa); + + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + lbrDumpCommon(&lv->common, f); fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c); fprintf(f, "\n"); dumpTextReverse(nfa, f); + fclose(f); } -void nfaExecLbrNVerm_dumpText(const NFA *nfa, FILE *f) { +void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) { assert(nfa); - assert(nfa->type == LBR_NFA_NVerm); + assert(nfa->type == LBR_NFA_NVERM); const lbr_verm *lv = (const lbr_verm 
*)getImplNfa(nfa); + + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + lbrDumpCommon(&lv->common, f); fprintf(f, "NEGATED VERM model, scanning for 0x%02x\n", lv->c); fprintf(f, "\n"); dumpTextReverse(nfa, f); + fclose(f); } -void nfaExecLbrShuf_dumpText(const NFA *nfa, FILE *f) { +void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) { assert(nfa); - assert(nfa->type == LBR_NFA_Shuf); + assert(nfa->type == LBR_NFA_SHUF); + + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa); lbrDumpCommon(&ls->common, f); - CharReach cr = shufti2cr(ls->mask_lo, ls->mask_hi); + CharReach cr = shufti2cr((const u8 *)&ls->mask_lo, + (const u8 *)&ls->mask_hi); fprintf(f, "SHUF model, scanning for: %s (%zu chars)\n", describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); + fclose(f); } -void nfaExecLbrTruf_dumpText(const NFA *nfa, FILE *f) { +void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) { assert(nfa); - assert(nfa->type == LBR_NFA_Truf); + assert(nfa->type == LBR_NFA_TRUF); + + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa); lbrDumpCommon(&lt->common, f); - CharReach cr = truffle2cr(lt->mask1, lt->mask2); + CharReach cr = truffle2cr((const u8 *)&lt->mask1, + (const u8 *)&lt->mask2); fprintf(f, "TRUFFLE model, scanning for: %s (%zu chars)\n", describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); + fclose(f); } } // namespace ue2 diff --git a/src/nfa/lbr_dump.h b/src/nfa/lbr_dump.h index 06ed51e2..ea4e3f38 100644 --- a/src/nfa/lbr_dump.h +++ b/src/nfa/lbr_dump.h @@ -31,28 +31,17 @@ #ifdef DUMP_SUPPORT -#include #include struct NFA; namespace ue2 { -void nfaExecLbrDot_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecLbrVerm_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecLbrNVerm_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecLbrShuf_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecLbrTruf_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecLbrDot_dumpText(const struct NFA *nfa, FILE *file); -void nfaExecLbrVerm_dumpText(const struct NFA *nfa, FILE *file); -void nfaExecLbrNVerm_dumpText(const struct NFA *nfa, FILE *file); -void nfaExecLbrTruf_dumpText(const struct NFA *nfa, FILE *file); -void nfaExecLbrShuf_dumpText(const struct NFA *nfa, FILE *file); +void nfaExecLbrDot_dump(const struct NFA *nfa, const std::string &base); +void nfaExecLbrVerm_dump(const struct NFA *nfa, const std::string &base); +void nfaExecLbrNVerm_dump(const struct NFA *nfa, const std::string &base); +void nfaExecLbrShuf_dump(const struct NFA *nfa, const std::string &base); +void nfaExecLbrTruf_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/limex.h b/src/nfa/limex.h index ad53503c..0223604d 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -41,9 +41,7 @@ extern "C" #define GENERATE_NFA_DUMP_DECL(gf_name) \ } /* extern "C" */ \ namespace ue2 { \ - void gf_name##_dumpDot(const struct NFA *nfa, FILE *file, \ - const std::string &base); \ - void gf_name##_dumpText(const struct NFA *nfa, FILE *file); \ + void gf_name##_dump(const struct NFA *nfa, const std::string &base); \ } /* namespace ue2 */ \ extern "C" { @@ -77,6 +75,7 @@ extern "C" GENERATE_NFA_DUMP_DECL(gf_name)
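With the dump interface collapsed to a single entry point per engine, `GENERATE_NFA_DUMP_DECL` now emits one `gf_name##_dump(nfa, base)` declaration in place of the old `_dumpDot`/`_dumpText` pair. A minimal sketch of the expansion, using a demo macro name so it does not collide with the real header:

```cpp
#include <string>

struct NFA;

// Demo version of the revised macro: one unified dump declaration per
// engine instead of separate dot/text entry points.
#define GENERATE_NFA_DUMP_DECL_DEMO(gf_name) \
    void gf_name##_dump(const struct NFA *nfa, const std::string &base);

GENERATE_NFA_DUMP_DECL_DEMO(nfaExecLimEx64)
// expands to:
//   void nfaExecLimEx64_dump(const struct NFA *nfa, const std::string &base);

int main() { return 0; } // declaration-only demo; nothing to run
```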
GENERATE_NFA_DECL(nfaExecLimEx32) +GENERATE_NFA_DECL(nfaExecLimEx64) GENERATE_NFA_DECL(nfaExecLimEx128) GENERATE_NFA_DECL(nfaExecLimEx256) GENERATE_NFA_DECL(nfaExecLimEx384) diff --git a/src/nfa/limex_64.c b/src/nfa/limex_64.c new file mode 100644 index 00000000..e8f0880b --- /dev/null +++ b/src/nfa/limex_64.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 64-bit runtime implementations. + */ + +/* Limex64 is unusual as, on 32 bit platforms, it uses an m128 at runtime for + * state calculations.
+ */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +#define SIZE 64 +#define ENG_STATE_T u64a + +#ifdef ARCH_64_BIT +#define STATE_T u64a +#define LOAD_FROM_ENG load_u64a +#else +#define STATE_T m128 +#define LOAD_FROM_ENG load_m128_from_u64a +#endif + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index 28f37083..c74c7079 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -78,10 +78,26 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end) { - u32 idx = packedExtract32(s, accel); + u32 idx = pext32(s, accel); return accelScanWrapper(accelTable, aux, input, idx, i, end); } +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(s, accel); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(movq(s), movq(accel)); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#endif + size_t doAccel128(const m128 *state, const struct LimExNFA128 *limex, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end) { diff --git a/src/nfa/limex_accel.h b/src/nfa/limex_accel.h index 173df759..e5c94e82 100644 --- a/src/nfa/limex_accel.h +++ b/src/nfa/limex_accel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,7 @@ #include "util/simd_utils.h" // for m128 etc union AccelAux; +struct LimExNFA64; struct LimExNFA128; struct LimExNFA256; struct LimExNFA384; @@ -49,6 +50,16 @@ size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end); +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#endif + size_t doAccel128(const m128 *s, const struct LimExNFA128 *limex, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end); diff --git a/src/nfa/limex_common_impl.h b/src/nfa/limex_common_impl.h index 9523b073..e441945d 100644 --- a/src/nfa/limex_common_impl.h +++ b/src/nfa/limex_common_impl.h @@ -31,14 +31,14 @@ /* impl of limex functions which depend only on state size */ -#if !defined(SIZE) || !defined(STATE_T) || !defined(INLINE_ATTR) -# error Must define SIZE and STATE_T and INLINE_ATTR in includer. 
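The `SIZE`/`STATE_T`/`ENG_STATE_T` split above is the heart of the 64-bit model: the compiled bytecode always stores engine state as a `u64a`, while the working type is a `u64a` GPR on 64-bit hosts and an `m128` on 32-bit hosts, with `LOAD_FROM_ENG` bridging the two. A portable sketch of the idea, using plain integers and a two-word struct as a stand-in for the real `m128`:

```cpp
#include <stdint.h>
#include <stdio.h>

// Engine state as laid out in the compiled bytecode: always 64-bit.
typedef uint64_t eng_state_t;

#if UINTPTR_MAX > 0xffffffffu
// 64-bit host: work directly in a general-purpose register.
typedef uint64_t state_t;
static state_t load_from_eng(const eng_state_t *p) { return *p; }
#else
// 32-bit host: the real code uses an m128; a two-word struct stands in here.
struct state_t { uint64_t lo, hi; };
static state_t load_from_eng(const eng_state_t *p) {
    state_t s = { *p, 0 };
    return s;
}
#endif

int main() {
    eng_state_t in_bytecode = 0x8001u; // states 0 and 15 on
    state_t live = load_from_eng(&in_bytecode); // widen to the working type
    (void)live;
    printf("engine state loaded into working type\n");
    return 0;
}
```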
+#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) \ + || !defined(INLINE_ATTR) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG and INLINE_ATTR in includer. #endif #define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) #define TESTEOD_FN JOIN(moNfaTestEod, SIZE) -#define TESTEOD_REV_FN JOIN(moNfaRevTestEod, SIZE) #define LIMEX_INACCEPT_FN JOIN(limexInAccept, SIZE) #define LIMEX_INANYACCEPT_FN JOIN(limexInAnyAccept, SIZE) #define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) @@ -46,12 +46,11 @@ #define INITIAL_FN JOIN(moNfaInitial, SIZE) #define TOP_FN JOIN(moNfaTop, SIZE) #define TOPN_FN JOIN(moNfaTopN, SIZE) +#define PROCESS_ACCEPTS_IMPL_FN JOIN(moProcessAcceptsImpl, SIZE) #define PROCESS_ACCEPTS_FN JOIN(moProcessAccepts, SIZE) #define PROCESS_ACCEPTS_NOSQUASH_FN JOIN(moProcessAcceptsNoSquash, SIZE) #define CONTEXT_T JOIN(NFAContext, SIZE) #define ONES_STATE JOIN(ones_, STATE_T) -#define LOAD_STATE JOIN(load_, STATE_T) -#define STORE_STATE JOIN(store_, STATE_T) #define AND_STATE JOIN(and_, STATE_T) #define OR_STATE JOIN(or_, STATE_T) #define ANDNOT_STATE JOIN(andnot_, STATE_T) @@ -62,6 +61,20 @@ #define SQUASH_UNTUG_BR_FN JOIN(lazyTug, SIZE) #define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) +#if defined(ARCH_64_BIT) && (SIZE >= 64) +#define CHUNK_T u64a +#define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 +#else +#define CHUNK_T u32 +#define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 +#endif + +#define NUM_STATE_CHUNKS (sizeof(STATE_T) / sizeof(CHUNK_T)) + static really_inline void SQUASH_UNTUG_BR_FN(const IMPL_NFA_T *limex, const union RepeatControl *repeat_ctrl, @@ -83,7 +96,7 @@ void SQUASH_UNTUG_BR_FN(const IMPL_NFA_T *limex, const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); u32 cyclicState = info->cyclicState; - if (!TESTBIT_STATE(accstate, cyclicState)) { + if (!TESTBIT_STATE(*accstate, cyclicState)) { continue; } @@ -100,70 +113,85 @@ void SQUASH_UNTUG_BR_FN(const IMPL_NFA_T *limex, } } -static never_inline -char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s, - const struct NFAAccept *acceptTable, u32 acceptCount, - u64a offset, NfaCallback callback, void *context) { +static really_inline +char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s, + STATE_T *squash, const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, u64a offset, + NfaCallback callback, void *context) { assert(s); assert(limex); assert(callback); - assert(acceptCount); - // We have squash masks we might have to apply after firing reports. - STATE_T squash = ONES_STATE; - const STATE_T *squashMasks = (const STATE_T *) - ((const char *)limex + limex->squashOffset); + const STATE_T accept_mask = *acceptMask; + STATE_T accepts = AND_STATE(*s, accept_mask); - for (u32 i = 0; i < acceptCount; i++) { - const struct NFAAccept *a = &acceptTable[i]; - if (TESTBIT_STATE(s, a->state)) { - DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", - a->state, a->externalId, offset); - int rv = callback(0, offset, a->externalId, context); + // Caller must ensure that we have at least one accept state on. + assert(ISNONZERO_STATE(accepts)); + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. 
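The loop that follows turns each on bit of a chunk into an index into the packed accept table: an entry's index is the number of accept-mask bits strictly below that bit (its rank within the chunk) plus `base_index`, the cumulative popcount of earlier chunks. A minimal sketch of the rank computation, assuming a GCC-style popcount builtin:

```cpp
#include <stdint.h>
#include <stdio.h>

// Index of 'bit' within the packed table implied by 'mask': the count of
// mask bits strictly below it. Assumes 'bit' is set in 'mask' and bit < 64.
static unsigned rank_in_mask64(uint64_t mask, unsigned bit) {
    return (unsigned)__builtin_popcountll(mask & ((1ULL << bit) - 1));
}

int main() {
    uint64_t accept_mask = 0x29; // accept states at bits 0, 3 and 5
    // Their packed accept-table entries sit at indices 0, 1 and 2:
    printf("%u %u %u\n", rank_in_mask64(accept_mask, 0),
           rank_in_mask64(accept_mask, 3), rank_in_mask64(accept_mask, 5));
    return 0;
}
```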
+ for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) { + CHUNK_T chunk = chunks[i]; + while (chunk != 0) { + u32 bit = FIND_AND_CLEAR_FN(&chunk); + u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit); + u32 idx = local_idx + base_index; + const struct NFAAccept *a = &acceptTable[idx]; + DEBUG_PRINTF("state %u: firing report list=%u, offset=%llu\n", + bit + i * (u32)sizeof(chunk) * 8, a->reports, offset); + int rv = limexRunAccept((const char *)limex, a, callback, context, + offset); if (unlikely(rv == MO_HALT_MATCHING)) { return 1; } - if (a->squash != MO_INVALID_IDX) { - assert(a->squash < limex->squashCount); - const STATE_T *sq = &squashMasks[a->squash]; - DEBUG_PRINTF("squash mask %u @ %p\n", a->squash, sq); - squash = AND_STATE(squash, LOAD_STATE(sq)); + if (squash != NULL && a->squash != MO_INVALID_IDX) { + DEBUG_PRINTF("applying squash mask at offset %u\n", a->squash); + const ENG_STATE_T *sq = + (const ENG_STATE_T *)((const char *)limex + a->squash); + *squash = AND_STATE(*squash, LOAD_FROM_ENG(sq)); } } + base_index += POPCOUNT_FN(mask_chunks[i]); } - STORE_STATE(s, AND_STATE(LOAD_STATE(s), squash)); return 0; } static never_inline -char PROCESS_ACCEPTS_NOSQUASH_FN(const STATE_T *s, - const struct NFAAccept *acceptTable, - u32 acceptCount, u64a offset, - NfaCallback callback, void *context) { - assert(s); - assert(callback); - assert(acceptCount); +char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s, + const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, u64a offset, + NfaCallback callback, void *context) { + // We have squash masks we might have to apply after firing reports. + STATE_T squash = ONES_STATE; + char rv = PROCESS_ACCEPTS_IMPL_FN(limex, s, &squash, acceptMask, + acceptTable, offset, callback, context); - for (u32 i = 0; i < acceptCount; i++) { - const struct NFAAccept *a = &acceptTable[i]; - if (TESTBIT_STATE(s, a->state)) { - DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", - a->state, a->externalId, offset); - int rv = callback(0, offset, a->externalId, context); - if (unlikely(rv == MO_HALT_MATCHING)) { - return 1; - } - } - } - return 0; + *s = AND_STATE(*s, squash); + return rv; } -// Run EOD accepts. +static never_inline +char PROCESS_ACCEPTS_NOSQUASH_FN(const IMPL_NFA_T *limex, const STATE_T *s, + const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, + u64a offset, NfaCallback callback, + void *context) { + STATE_T *squash = NULL; + return PROCESS_ACCEPTS_IMPL_FN(limex, s, squash, acceptMask, acceptTable, + offset, callback, context); +} + +// Run EOD accepts. Note that repeat_ctrl and repeat_state may be NULL if this +// LimEx contains no repeat structures.
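As written above, `PROCESS_ACCEPTS_FN` must apply the accumulated squash mask to the state after all reports have fired and only then return, while the no-squash variant simply passes a NULL squash pointer. A toy illustration of that ordering, with hypothetical names:

```cpp
#include <stdint.h>
#include <stdio.h>

// Toy model of the squash ordering: fire a report for every on accept
// state, AND each state's squash mask into a survivors mask, and write the
// squashed state back only after all reports have gone out.
static void fire_and_squash(uint64_t accepts, const uint64_t *squash_masks,
                            uint64_t *state) {
    uint64_t squash = ~0ULL; // initially everything survives
    while (accepts) {
        unsigned bit = (unsigned)__builtin_ctzll(accepts);
        accepts &= accepts - 1; // clear lowest set bit
        printf("report fired for state %u\n", bit);
        squash &= squash_masks[bit];
    }
    *state &= squash; // the step an early return would wrongly skip
}

int main() {
    // State 0 is an accept whose report squashes state 1.
    const uint64_t squash_masks[2] = { ~2ULL, ~0ULL };
    uint64_t state = 0x3; // states 0 and 1 on
    fire_and_squash(state & 0x1, squash_masks, &state);
    printf("state after squash: 0x%llx\n", (unsigned long long)state);
    return 0;
}
```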
static really_inline char TESTEOD_FN(const IMPL_NFA_T *limex, const STATE_T *s, const union RepeatControl *repeat_ctrl, - const char *repeat_state, u64a offset, char do_br, + const char *repeat_state, u64a offset, NfaCallback callback, void *context) { assert(limex && s); @@ -172,47 +200,16 @@ char TESTEOD_FN(const IMPL_NFA_T *limex, const STATE_T *s, return MO_CONTINUE_MATCHING; } - const STATE_T acceptEodMask = LOAD_STATE(&limex->acceptAtEOD); - STATE_T foundAccepts = AND_STATE(LOAD_STATE(s), acceptEodMask); + const STATE_T acceptEodMask = LOAD_FROM_ENG(&limex->acceptAtEOD); + STATE_T foundAccepts = AND_STATE(*s, acceptEodMask); - if (do_br) { - SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, - offset + 1 /* EOD 'symbol' */, &foundAccepts); - } else { - assert(!limex->repeatCount); - } + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, + offset + 1 /* EOD 'symbol' */, &foundAccepts); if (unlikely(ISNONZERO_STATE(foundAccepts))) { const struct NFAAccept *acceptEodTable = getAcceptEodTable(limex); - if (PROCESS_ACCEPTS_NOSQUASH_FN(&foundAccepts, acceptEodTable, - limex->acceptEodCount, offset, callback, - context)) { - return MO_HALT_MATCHING; - } - } - - return MO_CONTINUE_MATCHING; -} - -static really_inline -char TESTEOD_REV_FN(const IMPL_NFA_T *limex, const STATE_T *s, u64a offset, - NfaCallback callback, void *context) { - assert(limex && s); - - // There may not be any EOD accepts in this NFA. - if (!limex->acceptEodCount) { - return MO_CONTINUE_MATCHING; - } - - STATE_T acceptEodMask = LOAD_STATE(&limex->acceptAtEOD); - STATE_T foundAccepts = AND_STATE(LOAD_STATE(s), acceptEodMask); - - assert(!limex->repeatCount); - - if (unlikely(ISNONZERO_STATE(foundAccepts))) { - const struct NFAAccept *acceptEodTable = getAcceptEodTable(limex); - if (PROCESS_ACCEPTS_NOSQUASH_FN(&foundAccepts, acceptEodTable, - limex->acceptEodCount, offset, callback, + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptEodMask, + acceptEodTable, offset, callback, context)) { return MO_HALT_MATCHING; } @@ -228,8 +225,8 @@ char REPORTCURRENT_FN(const IMPL_NFA_T *limex, const struct mq *q) { assert(q->state); assert(q_cur_type(q) == MQE_START); - STATE_T s = LOAD_STATE(q->state); - STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T s = *(STATE_T *)q->state; + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); STATE_T foundAccepts = AND_STATE(s, acceptMask); if (unlikely(ISNONZERO_STATE(foundAccepts))) { @@ -238,8 +235,8 @@ char REPORTCURRENT_FN(const IMPL_NFA_T *limex, const struct mq *q) { const struct NFAAccept *acceptTable = getAcceptTable(limex); u64a offset = q_cur_offset(q); - if (PROCESS_ACCEPTS_NOSQUASH_FN(&foundAccepts, acceptTable, - limex->acceptCount, offset, q->cb, + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptMask, + acceptTable, offset, q->cb, q->context)) { return MO_HALT_MATCHING; } @@ -250,7 +247,7 @@ char REPORTCURRENT_FN(const IMPL_NFA_T *limex, const struct mq *q) { static really_inline STATE_T INITIAL_FN(const IMPL_NFA_T *impl, char onlyDs) { - return LOAD_STATE(onlyDs ? &impl->initDS : &impl->init); + return LOAD_FROM_ENG(onlyDs ? 
&impl->initDS : &impl->init); } static really_inline @@ -261,9 +258,9 @@ STATE_T TOP_FN(const IMPL_NFA_T *impl, char onlyDs, STATE_T state) { static really_inline STATE_T TOPN_FN(const IMPL_NFA_T *limex, STATE_T state, u32 n) { assert(n < limex->topCount); - const STATE_T *topsptr = - (const STATE_T *)((const char *)limex + limex->topOffset); - STATE_T top = LOAD_STATE(&topsptr[n]); + const ENG_STATE_T *topsptr = + (const ENG_STATE_T *)((const char *)limex + limex->topOffset); + STATE_T top = LOAD_FROM_ENG(&topsptr[n]); return OR_STATE(top, state); } @@ -279,8 +276,8 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx, DEBUG_PRINTF("expire estate at offset %llu\n", offset); - const STATE_T cyclics = - AND_STATE(LOAD_STATE(&ctx->s), LOAD_STATE(&limex->repeatCyclicMask)); + const STATE_T cyclics + = AND_STATE(ctx->s, LOAD_FROM_ENG(&limex->repeatCyclicMask)); if (ISZERO_STATE(cyclics)) { DEBUG_PRINTF("no cyclic states are on\n"); return; @@ -290,7 +287,7 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx, const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); u32 cyclicState = info->cyclicState; - if (!TESTBIT_STATE(&cyclics, cyclicState)) { + if (!TESTBIT_STATE(cyclics, cyclicState)) { continue; } @@ -310,14 +307,14 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx, last_top, repeat->repeatMax); u64a adj = 0; /* if the cycle's tugs are active at repeat max, it is still alive */ - if (TESTBIT_STATE((const STATE_T *)&limex->accept, cyclicState) || - TESTBIT_STATE((const STATE_T *)&limex->acceptAtEOD, cyclicState)) { + if (TESTBIT_STATE(LOAD_FROM_ENG(&limex->accept), cyclicState) || + TESTBIT_STATE(LOAD_FROM_ENG(&limex->acceptAtEOD), cyclicState)) { DEBUG_PRINTF("lazy tug possible - may still be inspected\n"); adj = 1; } else { - const STATE_T *tug_mask = - (const STATE_T *)((const char *)info + info->tugMaskOffset); - if (ISNONZERO_STATE(AND_STATE(ctx->s, LOAD_STATE(tug_mask)))) { + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + if (ISNONZERO_STATE(AND_STATE(ctx->s, LOAD_FROM_ENG(tug_mask)))) { DEBUG_PRINTF("tug possible - may still be inspected\n"); adj = 1; } @@ -339,37 +336,45 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, u64a offset, ReportID report) { assert(limex); - const STATE_T acceptMask = LOAD_STATE(&limex->accept); - STATE_T accstate = AND_STATE(state, acceptMask); + const STATE_T accept_mask = LOAD_FROM_ENG(&limex->accept); + STATE_T accepts = AND_STATE(state, accept_mask); // Are we in an accept state? - if (ISZERO_STATE(accstate)) { + if (ISZERO_STATE(accepts)) { DEBUG_PRINTF("no accept states are on\n"); return 0; } - SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accstate); + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accepts); DEBUG_PRINTF("looking for report %u\n", report); -#ifdef DEBUG - DEBUG_PRINTF("accept states that are on: "); - for (u32 i = 0; i < sizeof(STATE_T) * 8; i++) { - if (TESTBIT_STATE(&accstate, i)) printf("%u ", i); - } - printf("\n"); -#endif - - // Does one of our states match the given report ID? 
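The replacement below no longer scans a flat array of (state, externalId) pairs; each packed accept record either inlines a single report ID or points at a sentinel-terminated report list, which `limexAcceptHasReport` walks. A hedged sketch of that record layout, with invented names and `0xffffffff` standing in for the real invalid-report sentinel:

```cpp
#include <stdint.h>
#include <stdio.h>

#define INVALID_REPORT 0xffffffffu // hypothetical sentinel value

struct accept_rec {
    uint32_t single_report; // 1: 'reports' is the ID itself
    uint32_t reports;       // else: start index of a sentinel-terminated list
};

static bool accept_has_report(const accept_rec *a, const uint32_t *pool,
                              uint32_t report) {
    if (a->single_report) {
        return a->reports == report;
    }
    for (const uint32_t *r = pool + a->reports; *r != INVALID_REPORT; r++) {
        if (*r == report) {
            return true;
        }
    }
    return false;
}

int main() {
    const uint32_t pool[] = { 7, 9, INVALID_REPORT };
    accept_rec single = { 1, 42 };
    accept_rec multi = { 0, 0 }; // list begins at pool[0]
    printf("%d %d %d\n", accept_has_report(&single, pool, 42),
           accept_has_report(&multi, pool, 9),
           accept_has_report(&multi, pool, 8)); // prints: 1 1 0
    return 0;
}
```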
const struct NFAAccept *acceptTable = getAcceptTable(limex); - for (u32 i = 0; i < limex->acceptCount; i++) { - const struct NFAAccept *a = &acceptTable[i]; - DEBUG_PRINTF("checking idx=%u, externalId=%u\n", a->state, - a->externalId); - if (a->externalId == report && TESTBIT_STATE(&accstate, a->state)) { - DEBUG_PRINTF("report is on!\n"); - return 1; + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. + for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) { + CHUNK_T chunk = chunks[i]; + while (chunk != 0) { + u32 bit = FIND_AND_CLEAR_FN(&chunk); + u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit); + u32 idx = local_idx + base_index; + assert(idx < limex->acceptCount); + const struct NFAAccept *a = &acceptTable[idx]; + DEBUG_PRINTF("state %u is on, report list at %u\n", + bit + i * (u32)sizeof(chunk) * 8, a->reports); + + if (limexAcceptHasReport((const char *)limex, a, report)) { + DEBUG_PRINTF("report %u is on\n", report); + return 1; + } } + base_index += POPCOUNT_FN(mask_chunks[i]); } return 0; @@ -381,7 +386,7 @@ char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, u64a offset) { assert(limex); - const STATE_T acceptMask = LOAD_STATE(&limex->accept); + const STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); STATE_T accstate = AND_STATE(state, acceptMask); // Are we in an accept state? @@ -396,7 +401,6 @@ char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, } #undef TESTEOD_FN -#undef TESTEOD_REV_FN #undef REPORTCURRENT_FN #undef EXPIRE_ESTATE_FN #undef LIMEX_INACCEPT_FN @@ -407,8 +411,6 @@ char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, #undef CONTEXT_T #undef IMPL_NFA_T #undef ONES_STATE -#undef LOAD_STATE -#undef STORE_STATE #undef AND_STATE #undef OR_STATE #undef ANDNOT_STATE @@ -416,11 +418,14 @@ char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, #undef TESTBIT_STATE #undef ISNONZERO_STATE #undef ISZERO_STATE +#undef PROCESS_ACCEPTS_IMPL_FN #undef PROCESS_ACCEPTS_FN #undef PROCESS_ACCEPTS_NOSQUASH_FN #undef SQUASH_UNTUG_BR_FN #undef GET_NFA_REPEAT_INFO_FN -#undef SIZE -#undef STATE_T -#undef INLINE_ATTR +#undef CHUNK_T +#undef FIND_AND_CLEAR_FN +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN +#undef NUM_STATE_CHUNKS diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 77754e0b..ba4d0f0d 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -37,10 +37,10 @@ #include "limex_internal.h" #include "limex_limits.h" #include "nfa_build_util.h" +#include "nfagraph/ng_dominators.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_limex_accel.h" #include "nfagraph/ng_repeat.h" -#include "nfagraph/ng_restructuring.h" #include "nfagraph/ng_squash.h" #include "nfagraph/ng_util.h" #include "ue2common.h" @@ -64,12 +64,21 @@ #include #include #include + #include +#include using namespace std; +using boost::adaptors::map_values; namespace ue2 { +/** + * \brief Special state index value meaning that the vertex will not + * participate in an (NFA/DFA/etc) implementation. 
+ */ +static constexpr u32 NO_STATE = ~0; + namespace { struct precalcAccel { @@ -87,7 +96,7 @@ struct precalcAccel { struct limex_accel_info { ue2::unordered_set accelerable; map precalc; - ue2::unordered_map > friends; + ue2::unordered_map> friends; ue2::unordered_map accel_map; }; @@ -130,7 +139,7 @@ struct build_info { const vector &ri, const map &rsmi, const map &smi, - const map &ti, const set &zi, + const map> &ti, const set &zi, bool dai, bool sci, const CompileContext &cci, u32 nsi) : h(hi), state_ids(states_in), repeats(ri), tops(ti), zombies(zi), @@ -156,7 +165,7 @@ struct build_info { map reportSquashMap; map squashMap; - const map &tops; + const map> &tops; ue2::unordered_set tugs; map br_cyclic; const set &zombies; @@ -485,7 +494,7 @@ void nfaFindAccelSchemes(const NGHolder &g, // We want to skip any vertices that don't lead to at least one other // (self-loops don't count) vertex. if (!has_proper_successor(v, g)) { - DEBUG_PRINTF("skipping vertex %u\n", g[v].index); + DEBUG_PRINTF("skipping vertex %zu\n", g[v].index); continue; } @@ -493,7 +502,7 @@ void nfaFindAccelSchemes(const NGHolder &g, AccelScheme as; if (nfaCheckAccel(g, v, refined_cr, br_cyclic, &as, allow_wide)) { - DEBUG_PRINTF("graph vertex %u is accelerable with offset %u.\n", + DEBUG_PRINTF("graph vertex %zu is accelerable with offset %u.\n", g[v].index, as.offset); (*out)[v] = as; } @@ -505,7 +514,7 @@ struct fas_visitor : public boost::default_bfs_visitor { ue2::unordered_map *out_in) : accel_map(am_in), out(out_in) {} - void discover_vertex(NFAVertex v, const NFAGraph &) { + void discover_vertex(NFAVertex v, const NGHolder &) { if (accel_map.find(v) != accel_map.end()) { (*out)[v] = accel_map.find(v)->second; } @@ -518,36 +527,40 @@ struct fas_visitor : public boost::default_bfs_visitor { }; static -void filterAccelStates(NGHolder &g, const map &tops, +void filterAccelStates(NGHolder &g, const map> &tops, ue2::unordered_map *accel_map) { /* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything * else should be ditched. We use a simple BFS to choose accel states near * the start. */ - // Temporarily wire start to each top for the BFS. - vector topEdges; - wireStartToTops(g, tops, topEdges); + vector tempEdges; + for (const auto &vv : tops | map_values) { + for (NFAVertex v : vv) { + if (!edge(g.start, v, g).second) { + tempEdges.push_back(add_edge(g.start, v, g).first); + } + } + } // Similarly, connect (start, startDs) if necessary. if (!edge(g.start, g.startDs, g).second) { - auto e = add_edge(g.start, g.startDs, g).first; - topEdges.push_back(e); // Remove edge later. + NFAEdge e = add_edge(g.start, g.startDs, g); + tempEdges.push_back(e); // Remove edge later. 
} ue2::unordered_map out; try { vector colour(num_vertices(g)); - breadth_first_search( - g.g, g.start, + boost::breadth_first_search(g, g.start, visitor(fas_visitor(*accel_map, &out)) - .color_map(make_iterator_property_map( - colour.begin(), get(&NFAGraphVertexProps::index, g.g)))); + .color_map(make_iterator_property_map(colour.begin(), + get(vertex_index, g)))); } catch (fas_visitor *) { ; /* found max accel_states */ } - remove_edges(topEdges, g); + remove_edges(tempEdges, g); assert(out.size() <= NFA_MAX_ACCEL_STATES); accel_map->swap(out); @@ -614,7 +627,7 @@ void fillAccelInfo(build_info &bi) { /* for each subset of the accel keys need to find an accel scheme */ assert(astates.size() < 32); - sort(astates.begin(), astates.end(), make_index_ordering(g)); + sort(astates.begin(), astates.end()); for (u32 i = 1, i_end = 1U << astates.size(); i < i_end; i++) { DEBUG_PRINTF("saving info for accel %u\n", i); @@ -701,9 +714,157 @@ void fillAccelInfo(build_info &bi) { /** The AccelAux structure has large alignment specified, and this makes some * compilers do odd things unless we specify a custom allocator. */ -typedef vector > +typedef vector> AccelAuxVector; +#define IMPOSSIBLE_ACCEL_MASK (~0U) + +static +u32 getEffectiveAccelStates(const build_info &args, + u32 active_accel_mask, + const vector &accelStates) { + /* accelStates is indexed by the acceleration bit index and contains a + * reference to the original vertex & state_id */ + + /* Cases to consider: + * + * 1: Accel states a and b are on and b can squash a + * --> we can ignore a. This will result in a no longer being accurately + * modelled - we may miss escapes turning it off and we may also miss + * its successors being activated. + * + * 2: Accel state b is on but accel state a is off and a is .* and must be + * seen before b is reached (and would not be covered by (1)) + * --> if a is squashable (or may die unexpectedly) we should continue + * as is + * --> if a is not squashable we can treat this as a+b or as a no accel, + * impossible case + * --> this case could be extended to handle non dot reaches by + * effectively creating something similar to squash masks for the + * reverse graph + * + * + * Other cases: + * + * 3: Accel states a and b are on but have incompatible reaches + * --> we should treat this as an impossible case. Actually, this case + * is unlikely to arise as we pick states with wide reaches to + * accelerate so an empty intersection is unlikely. + * + * Note: we need to be careful when dealing with accel states corresponding + * to bounded repeat cyclics - they may 'turn off' based on a max bound and + * so we may still require on earlier states to be accurately modelled. 
+ */ + const NGHolder &h = args.h; + auto dom_map = findDominators(h); + + /* map from accel_id to mask of accel_ids that it is dominated by */ + vector dominated_by(accelStates.size()); + + map accel_id_map; + for (u32 accel_id = 0; accel_id < accelStates.size(); accel_id++) { + NFAVertex v = accelStates[accel_id].v; + accel_id_map[v] = accel_id; + } + + /* Note: we want a slightly less strict defn of dominate as skip edges + * prevent .* 'truly' dominating */ + for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) { + u32 accel_id = findAndClearLSB_32(&local_accel_mask); + assert(accel_id < accelStates.size()); + NFAVertex v = accelStates[accel_id].v; + while (dom_map[v]) { + v = dom_map[v]; + if (contains(accel_id_map, v)) { + dominated_by[accel_id] |= 1U << accel_id_map[v]; + } + /* TODO: could also look at inv_adj vertices to handle fan-in */ + for (NFAVertex a : adjacent_vertices_range(v, h)) { + if (a == v || !contains(accel_id_map, a) + || a == accelStates[accel_id].v /* not likely */) { + continue; + } + if (!is_subset_of(h[v].reports, h[a].reports)) { + continue; + } + auto v_succ = succs(v, h); + auto a_succ = succs(a, h); + if (is_subset_of(v_succ, a_succ)) { + dominated_by[accel_id] |= 1U << accel_id_map[a]; + } + } + } + } + + u32 may_turn_off = 0; /* BR with max bound, non-dots, squashed, etc */ + for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) { + u32 accel_id = findAndClearLSB_32(&local_accel_mask); + NFAVertex v = accelStates[accel_id].v; + u32 state_id = accelStates[accel_id].state; + assert(contains(args.accel.accelerable, v)); + if (!h[v].char_reach.all()) { + may_turn_off |= 1U << accel_id; + continue; + } + if (contains(args.br_cyclic, v) + && args.br_cyclic.at(v).repeatMax != depth::infinity()) { + may_turn_off |= 1U << accel_id; + continue; + } + for (const auto &s_mask : args.squashMap | map_values) { + if (!s_mask.test(state_id)) { + may_turn_off |= 1U << accel_id; + break; + } + } + for (const auto &s_mask : args.reportSquashMap | map_values) { + if (!s_mask.test(state_id)) { + may_turn_off |= 1U << accel_id; + break; + } + } + } + + /* Case 1: */ + u32 ignored = 0; + for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) { + u32 accel_id_b = findAndClearLSB_32(&local_accel_mask); + NFAVertex v = accelStates[accel_id_b].v; + if (!contains(args.squashMap, v)) { + continue; + } + assert(!contains(args.br_cyclic, v) + || args.br_cyclic.at(v).repeatMax == depth::infinity()); + NFAStateSet squashed = args.squashMap.at(v); + squashed.flip(); /* default sense for mask of survivors */ + + for (u32 local_accel_mask2 = active_accel_mask; local_accel_mask2; ) { + u32 accel_id_a = findAndClearLSB_32(&local_accel_mask2); + if (squashed.test(accelStates[accel_id_a].state)) { + ignored |= 1U << accel_id_a; + } + } + } + + /* Case 2: */ + for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) { + u32 accel_id = findAndClearLSB_32(&local_accel_mask); + + u32 stuck_dominators = dominated_by[accel_id] & ~may_turn_off; + if ((stuck_dominators & active_accel_mask) != stuck_dominators) { + DEBUG_PRINTF("only %08x on, but we require %08x\n", + active_accel_mask, stuck_dominators); + return IMPOSSIBLE_ACCEL_MASK; + } + } + + if (ignored) { + DEBUG_PRINTF("in %08x, ignoring %08x\n", active_accel_mask, ignored); + } + + return active_accel_mask & ~ignored; +} + static void buildAccel(const build_info &args, NFAStateSet &accelMask, NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec, @@ -735,11 +896,22 @@ void buildAccel(const 
build_info &args, NFAStateSet &accelMask, // Set up a unioned AccelBuild for every possible combination of the set // bits in accelStates. vector accelOuts(accelCount); + vector effective_accel_set; + effective_accel_set.push_back(0); /* empty is effectively empty */ + for (u32 i = 1; i < accelCount; i++) { - for (u32 j = 0, j_end = accelStates.size(); j < j_end; j++) { - if (i & (1U << j)) { - combineAccel(accelStates[j], accelOuts[i]); - } + u32 effective_i = getEffectiveAccelStates(args, i, accelStates); + effective_accel_set.push_back(effective_i); + + if (effective_i == IMPOSSIBLE_ACCEL_MASK) { + DEBUG_PRINTF("this combination of accel states is not possible\n"); + accelOuts[i].stop1 = CharReach::dot(); + continue; + } + + while (effective_i) { + u32 base_accel_state = findAndClearLSB_32(&effective_i); + combineAccel(accelStates[base_accel_state], accelOuts[i]); } minimiseAccel(accelOuts[i]); } @@ -759,29 +931,32 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, for (u32 i = 1; i < accelCount; i++) { memset(&aux, 0, sizeof(aux)); - NFAStateSet states(args.num_states); - for (u32 j = 0; j < accelStates.size(); j++) { - if (i & (1U << j)) { - states.set(accelStates[j].state); - } - } + NFAStateSet effective_states(args.num_states); + u32 effective_i = effective_accel_set[i]; AccelInfo ainfo; ainfo.double_offset = accelOuts[i].offset; ainfo.double_stop1 = accelOuts[i].stop1; ainfo.double_stop2 = accelOuts[i].stop2; - if (contains(accel.precalc, states)) { - const precalcAccel &precalc = accel.precalc.at(states); - if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) { - ainfo.ma_len1 = precalc.ma_info.len1; - ainfo.ma_len2 = precalc.ma_info.len2; - ainfo.multiaccel_offset = precalc.ma_info.offset; - ainfo.multiaccel_stops = precalc.ma_info.cr; - ainfo.ma_type = precalc.ma_info.type; - } else { - ainfo.single_offset = precalc.single_offset; - ainfo.single_stops = precalc.single_cr; + if (effective_i != IMPOSSIBLE_ACCEL_MASK) { + while (effective_i) { + u32 base_accel_id = findAndClearLSB_32(&effective_i); + effective_states.set(accelStates[base_accel_id].state); + } + + if (contains(accel.precalc, effective_states)) { + const auto &precalc = accel.precalc.at(effective_states); + if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) { + ainfo.ma_len1 = precalc.ma_info.len1; + ainfo.ma_len2 = precalc.ma_info.len2; + ainfo.multiaccel_offset = precalc.ma_info.offset; + ainfo.multiaccel_stops = precalc.ma_info.cr; + ainfo.ma_type = precalc.ma_info.type; + } else { + ainfo.single_offset = precalc.single_offset; + ainfo.single_stops = precalc.single_cr; + } } } @@ -824,14 +999,105 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, } static -void buildAccepts(const build_info &args, NFAStateSet &acceptMask, - NFAStateSet &acceptEodMask, vector &accepts, - vector &acceptsEod, vector &squash) { +u32 addSquashMask(const build_info &args, const NFAVertex &v, + vector &squash) { + auto sit = args.reportSquashMap.find(v); + if (sit == args.reportSquashMap.end()) { + return MO_INVALID_IDX; + } + + // This state has a squash mask. Paw through the existing vector to + // see if we've already seen it, otherwise add a new one. 
+ auto it = find(squash.begin(), squash.end(), sit->second); + if (it != squash.end()) { + return verify_u32(distance(squash.begin(), it)); + } + u32 idx = verify_u32(squash.size()); + squash.push_back(sit->second); + return idx; +} + +static +u32 addReports(const flat_set &r, vector &reports, + unordered_map, u32> &reportListCache) { + assert(!r.empty()); + + vector my_reports(begin(r), end(r)); + my_reports.push_back(MO_INVALID_IDX); // sentinel + + auto cache_it = reportListCache.find(my_reports); + if (cache_it != end(reportListCache)) { + u32 offset = cache_it->second; + DEBUG_PRINTF("reusing cached report list at %u\n", offset); + return offset; + } + + auto it = search(begin(reports), end(reports), begin(my_reports), + end(my_reports)); + if (it != end(reports)) { + u32 offset = verify_u32(distance(begin(reports), it)); + DEBUG_PRINTF("reusing found report list at %u\n", offset); + return offset; + } + + u32 offset = verify_u32(reports.size()); + insert(&reports, reports.end(), my_reports); + reportListCache.emplace(move(my_reports), offset); + return offset; +} + +static +void buildAcceptsList(const build_info &args, + unordered_map, u32> &reports_cache, + vector &verts, vector &accepts, + vector &reports, vector &squash) { + if (verts.empty()) { + return; + } + + DEBUG_PRINTF("building accept lists for %zu states\n", verts.size()); + + auto cmp_state_id = [&args](NFAVertex a, NFAVertex b) { + u32 a_state = args.state_ids.at(a); + u32 b_state = args.state_ids.at(b); + assert(a_state != b_state || a == b); + return a_state < b_state; + }; + + sort(begin(verts), end(verts), cmp_state_id); + + const NGHolder &h = args.h; + for (const auto &v : verts) { + DEBUG_PRINTF("state=%u, reports: [%s]\n", args.state_ids.at(v), + as_string_list(h[v].reports).c_str()); + NFAAccept a; + memset(&a, 0, sizeof(a)); + assert(!h[v].reports.empty()); + if (h[v].reports.size() == 1) { + a.single_report = 1; + a.reports = *h[v].reports.begin(); + } else { + a.single_report = 0; + a.reports = addReports(h[v].reports, reports, reports_cache); + } + a.squash = addSquashMask(args, v, squash); + accepts.push_back(move(a)); + } +} + +static +void buildAccepts(const build_info &args, + unordered_map, u32> &reports_cache, + NFAStateSet &acceptMask, NFAStateSet &acceptEodMask, + vector &accepts, vector &acceptsEod, + vector &reports, vector &squash) { const NGHolder &h = args.h; acceptMask.resize(args.num_states); acceptEodMask.resize(args.num_states); + vector verts_accept, verts_accept_eod; + for (auto v : vertices_range(h)) { u32 state_id = args.state_ids.at(v); @@ -839,41 +1105,20 @@ void buildAccepts(const build_info &args, NFAStateSet &acceptMask, continue; } - u32 squashMaskOffset = MO_INVALID_IDX; - auto sit = args.reportSquashMap.find(v); - if (sit != args.reportSquashMap.end()) { - // This state has a squash mask. Paw through the existing vector to - // see if we've already seen it, otherwise add a new one. - auto it = find(squash.begin(), squash.end(), sit->second); - if (it != squash.end()) { - squashMaskOffset = verify_u32(distance(squash.begin(), it)); - } else { - squashMaskOffset = verify_u32(squash.size()); - squash.push_back(sit->second); - } - } - - // Add an accept (or acceptEod) per report ID. 
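`addReports` above pools report lists in one flat vector: each list is appended with a sentinel terminator, and both a cache and a direct search over the existing pool let identical lists share an offset. A compact sketch of the pooling idea (names are illustrative):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static const uint32_t SENTINEL = 0xffffffffu;

// Store 'list' once in 'pool' and return its offset; identical lists that
// are already present (sentinel included) are reused rather than duplicated.
static uint32_t add_report_list(std::vector<uint32_t> &pool,
                                std::vector<uint32_t> list) {
    list.push_back(SENTINEL); // terminate, so lookups match whole lists
    auto it = std::search(pool.begin(), pool.end(), list.begin(), list.end());
    if (it != pool.end()) {
        return (uint32_t)std::distance(pool.begin(), it); // reuse
    }
    uint32_t offset = (uint32_t)pool.size();
    pool.insert(pool.end(), list.begin(), list.end());
    return offset;
}

int main() {
    std::vector<uint32_t> pool;
    uint32_t a = add_report_list(pool, {1, 2});
    uint32_t b = add_report_list(pool, {1, 2}); // deduplicated
    std::printf("%u %u (pool size %zu)\n", a, b, pool.size()); // 0 0 (3)
    return 0;
}
```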
-
-        vector<NFAAccept> *accepts_out;
         if (edge(v, h.accept, h).second) {
             acceptMask.set(state_id);
-            accepts_out = &accepts;
+            verts_accept.push_back(v);
         } else {
             assert(edge(v, h.acceptEod, h).second);
             acceptEodMask.set(state_id);
-            accepts_out = &acceptsEod;
-        }
-
-        for (auto report : h[v].reports) {
-            accepts_out->push_back(NFAAccept());
-            NFAAccept &a = accepts_out->back();
-            a.state = state_id;
-            a.externalId = report;
-            a.squash = squashMaskOffset;
-            DEBUG_PRINTF("Accept: state=%u, externalId=%u\n", state_id, report);
+            verts_accept_eod.push_back(v);
         }
     }
+
+    buildAcceptsList(args, reports_cache, verts_accept, accepts, reports,
+                     squash);
+    buildAcceptsList(args, reports_cache, verts_accept_eod, acceptsEod, reports,
+                     squash);
 }
 
 static
@@ -884,19 +1129,20 @@ void buildTopMasks(const build_info &args, vector<NFAStateSet> &topMasks) {
     u32 numMasks = args.tops.rbegin()->first + 1; // max mask index
     DEBUG_PRINTF("we have %u top masks\n", numMasks);
 
-    assert(numMasks <= NFA_MAX_TOP_MASKS);
 
     topMasks.assign(numMasks, NFAStateSet(args.num_states)); // all zeroes
 
     for (const auto &m : args.tops) {
         u32 mask_idx = m.first;
-        u32 state_id = args.state_ids.at(m.second);
-        DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);
+        for (NFAVertex v : m.second) {
+            u32 state_id = args.state_ids.at(v);
+            DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);
 
-        assert(mask_idx < numMasks);
-        assert(state_id != NO_STATE);
+            assert(mask_idx < numMasks);
+            assert(state_id != NO_STATE);
 
-        topMasks[mask_idx].set(state_id);
+            topMasks[mask_idx].set(state_id);
+        }
     }
 }
 
@@ -1146,36 +1392,12 @@ struct ExceptionProto {
     }
 };
 
-static
-u32 getReportListIndex(const flat_set<ReportID> &reports,
-                       vector<ReportID> &exceptionReports,
-                       map<vector<ReportID>, u32> &reportListCache) {
-    if (reports.empty()) {
-        return MO_INVALID_IDX;
-    }
-
-    const vector<ReportID> r(reports.begin(), reports.end());
-
-    auto it = reportListCache.find(r);
-    if (it != reportListCache.end()) {
-        u32 idx = it->second;
-        assert(idx < exceptionReports.size());
-        assert(equal(r.begin(), r.end(), exceptionReports.begin() + idx));
-        return idx;
-    }
-
-    u32 idx = verify_u32(exceptionReports.size());
-    reportListCache[r] = idx;
-    exceptionReports.insert(exceptionReports.end(), r.begin(), r.end());
-    exceptionReports.push_back(MO_INVALID_IDX); // terminator
-    return idx;
-}
-
 static
 u32 buildExceptionMap(const build_info &args,
+                      unordered_map<vector<ReportID>, u32> &reports_cache,
                       const ue2::unordered_set<NFAVertex> &exceptional,
-                      map<ExceptionProto, vector<u32> > &exceptionMap,
-                      vector<ReportID> &exceptionReports) {
+                      map<ExceptionProto, vector<u32>> &exceptionMap,
+                      vector<ReportID> &reportList) {
     const NGHolder &h = args.h;
     const u32 num_states = args.num_states;
     u32 exceptionCount = 0;
@@ -1193,10 +1415,6 @@ u32 buildExceptionMap(const build_info &args,
         }
     }
 
-    // We track report lists that have already been written into the global
-    // list in case we can reuse them.
-    map<vector<ReportID>, u32> reportListCache;
-
     for (auto v : vertices_range(h)) {
         const u32 i = args.state_ids.at(v);
 
@@ -1215,8 +1433,12 @@ u32 buildExceptionMap(const build_info &args,
             DEBUG_PRINTF("state %u is exceptional due to accept "
                          "(%zu reports)\n", i, reports.size());
 
-            e.reports_index =
-                getReportListIndex(reports, exceptionReports, reportListCache);
+            if (reports.empty()) {
+                e.reports_index = MO_INVALID_IDX;
+            } else {
+                e.reports_index =
+                    addReports(reports, reportList, reports_cache);
+            }
 
             // We may be applying a report squash too.
auto mi = args.reportSquashMap.find(v); @@ -1438,7 +1660,8 @@ struct Factory { sizeof(limex->init), stateSize, repeatscratchStateSize, repeatStreamState); - size_t scratchStateSize = sizeof(limex->init); + size_t scratchStateSize = NFATraits::scratch_state_size; + if (repeatscratchStateSize) { scratchStateSize = ROUNDUP_N(scratchStateSize, alignof(RepeatControl)); @@ -1641,9 +1864,10 @@ struct Factory { } static - void writeExceptions(const map > &exceptionMap, - const vector &repeatOffsets, - implNFA_t *limex, const u32 exceptionsOffset) { + void writeExceptions(const map> &exceptionMap, + const vector &repeatOffsets, implNFA_t *limex, + const u32 exceptionsOffset, + const u32 reportListOffset) { DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset); exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset); @@ -1670,7 +1894,12 @@ struct Factory { exception_t &e = etable[ecount]; maskSetBits(e.squash, proto.squash_states); maskSetBits(e.successors, proto.succ_states); - e.reports = proto.reports_index; + if (proto.reports_index == MO_INVALID_IDX) { + e.reports = MO_INVALID_IDX; + } else { + e.reports = reportListOffset + + proto.reports_index * sizeof(ReportID); + } e.hasSquash = verify_u8(proto.squash); e.trigger = verify_u8(proto.trigger); u32 repeat_offset = proto.repeat_index == MO_INVALID_IDX @@ -1789,7 +2018,9 @@ struct Factory { const vector &acceptsEod, const vector &squash, implNFA_t *limex, const u32 acceptsOffset, const u32 acceptsEodOffset, - const u32 squashOffset) { + const u32 squashOffset, const u32 reportListOffset) { + char *limex_base = (char *)limex; + DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n", acceptsOffset, acceptsEodOffset, squashOffset); @@ -1797,27 +2028,39 @@ struct Factory { maskSetBits(limex->accept, acceptMask); maskSetBits(limex->acceptAtEOD, acceptEodMask); + // Transforms the indices (report list, squash mask) into offsets + // relative to the base of the limex. + auto transform_offset_fn = [&](NFAAccept a) { + if (!a.single_report) { + a.reports = reportListOffset + a.reports * sizeof(ReportID); + } + a.squash = squashOffset + a.squash * sizeof(tableRow_t); + return a; + }; + // Write accept table. limex->acceptOffset = acceptsOffset; limex->acceptCount = verify_u32(accepts.size()); DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size()); - NFAAccept *acceptsTable = (NFAAccept *)((char *)limex + acceptsOffset); + NFAAccept *acceptsTable = (NFAAccept *)(limex_base + acceptsOffset); assert(ISALIGNED(acceptsTable)); - copy(accepts.begin(), accepts.end(), acceptsTable); + transform(accepts.begin(), accepts.end(), acceptsTable, + transform_offset_fn); // Write eod accept table. limex->acceptEodOffset = acceptsEodOffset; limex->acceptEodCount = verify_u32(acceptsEod.size()); DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size()); - NFAAccept *acceptsEodTable = (NFAAccept *)((char *)limex + acceptsEodOffset); + NFAAccept *acceptsEodTable = (NFAAccept *)(limex_base + acceptsEodOffset); assert(ISALIGNED(acceptsEodTable)); - copy(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable); + transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable, + transform_offset_fn); // Write squash mask table. 
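An aside on the arithmetic in `transform_offset_fn` above: indices become byte offsets relative to the start of the engine structure. With a 4-byte `ReportID`, a hypothetical `reportListOffset` of 1024 and a list index of 3, the stored value is 1024 + 3 * 4 = 1036, which the runtime resolves with plain pointer arithmetic, along the lines of:

    // sketch: resolving an offset-encoded report list at scan time
    const ReportID *reports =
        (const ReportID *)((const char *)limex + a.reports);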
limex->squashCount = verify_u32(squash.size()); limex->squashOffset = squashOffset; DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size()); - tableRow_t *mask = (tableRow_t *)((char *)limex + squashOffset); + tableRow_t *mask = (tableRow_t *)(limex_base + squashOffset); assert(ISALIGNED(mask)); for (size_t i = 0, end = squash.size(); i < end; i++) { maskSetBits(mask[i], squash[i]); @@ -1854,15 +2097,12 @@ struct Factory { } static - void writeExceptionReports(const vector &reports, - implNFA_t *limex, - const u32 exceptionReportsOffset) { - DEBUG_PRINTF("exceptionReportsOffset=%u\n", exceptionReportsOffset); - - limex->exReportOffset = exceptionReportsOffset; - assert(ISALIGNED_N((char *)limex + exceptionReportsOffset, + void writeReportList(const vector &reports, implNFA_t *limex, + const u32 reportListOffset) { + DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset); + assert(ISALIGNED_N((char *)limex + reportListOffset, alignof(ReportID))); - copy_bytes((char *)limex + exceptionReportsOffset, reports); + copy_bytes((char *)limex + reportListOffset, reports); } static @@ -1881,16 +2121,21 @@ struct Factory { repeatSize += repeats[i].second; } + // We track report lists that have already been written into the global + // list in case we can reuse them. + unordered_map, u32> reports_cache; + ue2::unordered_set exceptional; u32 shiftCount = findBestNumOfVarShifts(args); assert(shiftCount); u32 maxShift = findMaxVarShift(args, shiftCount); findExceptionalTransitions(args, exceptional, maxShift); - map > exceptionMap; - vector exceptionReports; - u32 exceptionCount = buildExceptionMap(args, exceptional, exceptionMap, - exceptionReports); + map> exceptionMap; + vector reportList; + + u32 exceptionCount = buildExceptionMap(args, reports_cache, exceptional, + exceptionMap, reportList); assert(exceptionCount <= args.num_states); @@ -1907,8 +2152,8 @@ struct Factory { NFAStateSet acceptMask, acceptEodMask; vector accepts, acceptsEod; vector squash; - buildAccepts(args, acceptMask, acceptEodMask, accepts, acceptsEod, - squash); + buildAccepts(args, reports_cache, acceptMask, acceptEodMask, accepts, + acceptsEod, reportList, squash); // Build all our accel info. NFAStateSet accelMask, accelFriendsMask; @@ -1949,8 +2194,8 @@ struct Factory { const u32 exceptionsOffset = offset; offset += sizeof(exception_t) * exceptionCount; - const u32 exceptionReportsOffset = offset; - offset += sizeof(ReportID) * exceptionReports.size(); + const u32 reportListOffset = offset; + offset += sizeof(ReportID) * reportList.size(); const u32 repeatOffsetsOffset = offset; offset += sizeof(u32) * args.repeats.size(); @@ -1977,7 +2222,8 @@ struct Factory { limex, accelTableOffset, accelAuxOffset); writeAccepts(acceptMask, acceptEodMask, accepts, acceptsEod, squash, - limex, acceptsOffset, acceptsEodOffset, squashOffset); + limex, acceptsOffset, acceptsEodOffset, squashOffset, + reportListOffset); limex->shiftCount = shiftCount; writeShiftMasks(args, limex); @@ -1985,14 +2231,15 @@ struct Factory { // Determine the state required for our state vector. findStateSize(args, limex); - writeExceptionReports(exceptionReports, limex, exceptionReportsOffset); + writeReportList(reportList, limex, reportListOffset); // Repeat structures and offset table. 
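Exceptions point into the same shared report list, but keep `MO_INVALID_IDX` as a "no reports" marker, so a consumer must distinguish the two cases before doing pointer arithmetic. The dump code later in this diff does exactly this; a sketch of the pattern:

    if (e->reports == MO_INVALID_IDX) {
        // exception fires no reports
    } else {
        const ReportID *r =
            (const ReportID *)((const char *)limex + e->reports);
        for (; *r != MO_INVALID_IDX; r++) {
            // fire report *r
        }
    }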
vector repeatOffsets; writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset, repeatsOffset); - writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset); + writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset, + reportListOffset); writeLimexMasks(args, limex); @@ -2021,13 +2268,6 @@ struct Factory { sz = 32; } - // Special case: with SIMD available, we definitely prefer using - // 128-bit NFAs over 64-bit ones given the paucity of registers - // available. - if (sz == 64) { - sz = 128; - } - if (args.cc.grey.nfaForceSize) { sz = args.cc.grey.nfaForceSize; } @@ -2067,9 +2307,12 @@ struct scoreNfa { typedef u_##mlt_size tableRow_t; \ typedef NFAException##mlt_size exception_t; \ static const size_t maxStates = mlt_size; \ + static const size_t scratch_state_size = mlt_size == 64 ? sizeof(m128) \ + : sizeof(tableRow_t); \ }; MAKE_LIMEX_TRAITS(32) +MAKE_LIMEX_TRAITS(64) MAKE_LIMEX_TRAITS(128) MAKE_LIMEX_TRAITS(256) MAKE_LIMEX_TRAITS(384) @@ -2080,19 +2323,18 @@ MAKE_LIMEX_TRAITS(512) #ifndef NDEBUG // Some sanity tests, called by an assertion in generate(). static UNUSED -bool isSane(const NGHolder &h, const map &tops, +bool isSane(const NGHolder &h, const map> &tops, const ue2::unordered_map &state_ids, u32 num_states) { ue2::unordered_set seen; ue2::unordered_set top_starts; - for (const auto &m : tops) { - top_starts.insert(m.second); + for (const auto &vv : tops | map_values) { + insert(&top_starts, vv); } for (auto v : vertices_range(h)) { if (!contains(state_ids, v)) { - DEBUG_PRINTF("no entry for vertex %u in state map\n", - h[v].index); + DEBUG_PRINTF("no entry for vertex %zu in state map\n", h[v].index); return false; } const u32 i = state_ids.at(v); @@ -2100,8 +2342,7 @@ bool isSane(const NGHolder &h, const map &tops, continue; } - DEBUG_PRINTF("checking vertex %u (state %u)\n", h[v].index, - i); + DEBUG_PRINTF("checking vertex %zu (state %u)\n", h[v].index, i); if (i >= num_states || contains(seen, i)) { DEBUG_PRINTF("vertex %u/%u has invalid state\n", i, num_states); @@ -2111,7 +2352,7 @@ bool isSane(const NGHolder &h, const map &tops, // All our states should be reachable and have a state assigned. if (h[v].char_reach.none()) { - DEBUG_PRINTF("vertex %u has empty reachability\n", h[v].index); + DEBUG_PRINTF("vertex %zu has empty reachability\n", h[v].index); return false; } @@ -2119,7 +2360,7 @@ bool isSane(const NGHolder &h, const map &tops, // must have at least one predecessor that is not itself. 
if (v != h.start && v != h.startDs && !contains(top_starts, v) && !proper_in_degree(v, h)) { - DEBUG_PRINTF("vertex %u has no pred\n", h[v].index); + DEBUG_PRINTF("vertex %zu has no pred\n", h[v].index); return false; } } @@ -2150,7 +2391,7 @@ aligned_unique_ptr generate(NGHolder &h, const vector &repeats, const map &reportSquashMap, const map &squashMap, - const map &tops, + const map> &tops, const set &zombies, bool do_accel, bool stateCompression, @@ -2222,7 +2463,7 @@ u32 countAccelStates(NGHolder &h, const vector &repeats, const map &reportSquashMap, const map &squashMap, - const map &tops, + const map> &tops, const set &zombies, const CompileContext &cc) { const u32 num_states = max_state(states) + 1; diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h index 62a07e10..21cb7608 100644 --- a/src/nfa/limex_compile.h +++ b/src/nfa/limex_compile.h @@ -71,7 +71,7 @@ aligned_unique_ptr generate(NGHolder &g, const std::vector &repeats, const std::map &reportSquashMap, const std::map &squashMap, - const std::map &tops, + const std::map> &tops, const std::set &zombies, bool do_accel, bool stateCompression, @@ -89,7 +89,7 @@ u32 countAccelStates(NGHolder &h, const std::vector &repeats, const std::map &reportSquashMap, const std::map &squashMap, - const std::map &tops, + const std::map> &tops, const std::set &zombies, const CompileContext &cc); diff --git a/src/nfa/limex_context.h b/src/nfa/limex_context.h index 74f22c32..60d20879 100644 --- a/src/nfa/limex_context.h +++ b/src/nfa/limex_context.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +39,16 @@ // Runtime context structures. +/* Note: The size of the context structures may vary from platform to platform + * (notably, for the Limex64 structure). As a result, information based on the + * size and other detail of these structures should not be written into the + * bytecode -- really, the details of the structure should not be accessed by + * the ue2 compile side at all. + */ +#ifdef __cplusplus +#error ue2 runtime only file +#endif + /* cached_estate/esucc etc... 
* * If the exception state matches the cached_estate we will apply @@ -66,6 +76,11 @@ struct ALIGN_CL_DIRECTIVE NFAContext##nsize { \ }; GEN_CONTEXT_STRUCT(32, u32) +#ifdef ARCH_64_BIT +GEN_CONTEXT_STRUCT(64, u64a) +#else +GEN_CONTEXT_STRUCT(64, m128) +#endif GEN_CONTEXT_STRUCT(128, m128) GEN_CONTEXT_STRUCT(256, m256) GEN_CONTEXT_STRUCT(384, m384) diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index c52adc46..852639ea 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -35,9 +35,10 @@ #include "limex_internal.h" #include "nfa_dump_internal.h" #include "ue2common.h" +#include "util/charreach.h" #include "util/dump_charclass.h" #include "util/dump_mask.h" -#include "util/charreach.h" +#include "util/dump_util.h" #include #include @@ -70,6 +71,10 @@ template<> struct limex_traits { static const u32 size = 128; typedef NFAException128 exception_type; }; +template<> struct limex_traits { + static const u32 size = 64; + typedef NFAException64 exception_type; +}; template<> struct limex_traits { static const u32 size = 32; typedef NFAException32 exception_type; @@ -82,7 +87,7 @@ void dumpMask(FILE *f, const char *name, const u8 *mask, u32 mask_bits) { template static -u32 rank_in_mask(mask_t mask, u32 bit) { +u32 rank_in_mask(const mask_t &mask, u32 bit) { assert(bit < 8 * sizeof(mask)); u32 chunks[sizeof(mask)/sizeof(u32)]; @@ -176,26 +181,40 @@ void dumpAccel(const limex_type *limex, FILE *f) { } } +static +void dumpAcceptList(const char *limex_base, const struct NFAAccept *accepts, + u32 acceptCount, FILE *f) { + for (u32 i = 0; i < acceptCount; i++) { + const NFAAccept &a = accepts[i]; + if (a.single_report) { + fprintf(f, " idx %u fires single report %u\n", i, a.reports); + continue; + } + fprintf(f, " idx %u fires report list %u:", i, a.reports); + const ReportID *report = (const ReportID *)(limex_base + a.reports); + for (; *report != MO_INVALID_IDX; report++) { + fprintf(f, " %u", *report); + } + fprintf(f, "\n"); + } +} + template static void dumpAccepts(const limex_type *limex, FILE *f) { - u32 acceptCount = limex->acceptCount; - u32 acceptEodCount = limex->acceptEodCount; + const char *limex_base = (const char *)limex; + + const u32 acceptCount = limex->acceptCount; + const u32 acceptEodCount = limex->acceptEodCount; fprintf(f, "\n%u accepts.\n", acceptCount); - const struct NFAAccept *accepts - = (const struct NFAAccept *)((const char *)limex + limex->acceptOffset); - for (u32 i = 0; i < acceptCount; i++) { - fprintf(f, " state %u fires report %u\n", accepts[i].state, - accepts[i].externalId); - } + const auto *accepts = + (const struct NFAAccept *)(limex_base + limex->acceptOffset); + dumpAcceptList(limex_base, accepts, acceptCount, f); fprintf(f, "\n%u accepts at EOD.\n", acceptEodCount); - accepts = (const struct NFAAccept *)((const char *)limex - + limex->acceptEodOffset); - for (u32 i = 0; i < acceptEodCount; i++) { - fprintf(f, " state %u fires report %u\n", accepts[i].state, - accepts[i].externalId); - } + const auto *accepts_eod = + (const struct NFAAccept *)(limex_base + limex->acceptEodOffset); + dumpAcceptList(limex_base, accepts_eod, acceptEodCount, f); fprintf(f, "\n"); } @@ -222,20 +241,15 @@ getExceptionTable(const limex_type *limex) { ((const char *)limex + limex->exceptionOffset); } -template -static -const ReportID *getReportList(const limex_type *limex) { - return (const ReportID *)((const char *)limex + limex->exReportOffset); -} - template static void dumpLimexExceptions(const limex_type *limex, FILE *f) { const typename 
limex_traits::exception_type *e = getExceptionTable(limex); - const ReportID *reports = getReportList(limex); const u32 size = limex_traits::size; + const char *limex_base = (const char *)limex; + fprintf(f, "\n"); for (u32 i = 0; i < limex->exceptionCount; i++) { fprintf(f, "exception %u: hasSquash=%u, reports offset=%u\n", @@ -251,7 +265,7 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) { if (e[i].reports == MO_INVALID_IDX) { fprintf(f, " \n"); } else { - const ReportID *r = reports + e[i].reports; + const ReportID *r = (const ReportID *)(limex_base + e[i].reports); while (*r != MO_INVALID_IDX) { fprintf(f, " %u", *r++); } @@ -459,36 +473,32 @@ void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) { } } -#define DUMP_TEXT_FN(ddf_n) \ - void nfaExecLimEx##ddf_n##_dumpText(const NFA *nfa, FILE *f) { \ - dumpLimexText((const LimExNFA##ddf_n *)getImplNfa(nfa), f); \ - } - -#define DUMP_DOT_FN(ddf_n) \ - void nfaExecLimEx##ddf_n##_dumpDot(const NFA *nfa, FILE *f, \ - UNUSED const string &base) { \ - const LimExNFA##ddf_n *limex = \ - (const LimExNFA##ddf_n *)getImplNfa(nfa); \ +#define LIMEX_DUMP_FN(size) \ + void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) { \ + auto limex = (const LimExNFA##size *)getImplNfa(nfa); \ \ + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); \ + dumpLimexText(limex, f); \ + fclose(f); \ + \ + f = fopen_or_throw((base + ".dot").c_str(), "w"); \ dumpDotPreamble(f); \ u32 state_count = nfa->nPositions; \ dumpVertexDotInfo(limex, state_count, f, \ - limex_labeller(limex)); \ + limex_labeller(limex)); \ for (u32 i = 0; i < state_count; i++) { \ dumpLimDotInfo(limex, i, f); \ dumpExDotInfo(limex, i, f); \ } \ dumpDotTrailer(f); \ + fclose(f); \ } -#define LIMEX_DUMP_FNS(size) \ - DUMP_TEXT_FN(size) \ - DUMP_DOT_FN(size) - -LIMEX_DUMP_FNS(32) -LIMEX_DUMP_FNS(128) -LIMEX_DUMP_FNS(256) -LIMEX_DUMP_FNS(384) -LIMEX_DUMP_FNS(512) +LIMEX_DUMP_FN(32) +LIMEX_DUMP_FN(64) +LIMEX_DUMP_FN(128) +LIMEX_DUMP_FN(256) +LIMEX_DUMP_FN(384) +LIMEX_DUMP_FN(512) } // namespace ue2 diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 175ca393..e770c327 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -32,8 +32,8 @@ * X-macro generic impl, included into the various LimEx model implementations. */ -#if !defined(SIZE) || !defined(STATE_T) -# error Must define SIZE and STATE_T in includer. +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. 
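For reference, the includer pattern this `#error` guards: each engine translation unit defines the parameters once and then pulls in the whole chain of shared implementation headers, as the limex_simd128.c hunk later in this diff shows:

    #define SIZE 128
    #define STATE_T m128
    #define ENG_STATE_T m128
    #define LOAD_FROM_ENG load_m128

    #include "limex_exceptional.h"
    #include "limex_state_impl.h"
    #define INLINE_ATTR really_inline
    #include "limex_common_impl.h"
    #include "limex_runtime_impl.h"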
#endif #include "config.h" @@ -44,8 +44,6 @@ #define PE_FN JOIN(processExceptional, SIZE) #define RUN_EXCEPTION_FN JOIN(runException, SIZE) #define ZERO_STATE JOIN(zero_, STATE_T) -#define LOAD_STATE JOIN(load_, STATE_T) -#define STORE_STATE JOIN(store_, STATE_T) #define AND_STATE JOIN(and_, STATE_T) #define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b))) #define OR_STATE JOIN(or_, STATE_T) @@ -59,7 +57,7 @@ #define ESTATE_ARG STATE_T estate #else #define ESTATE_ARG const STATE_T *estatep -#define estate LOAD_STATE(estatep) +#define estate (*estatep) #endif #ifdef STATE_ON_STACK @@ -97,7 +95,6 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, STATE_T *local_succ, #endif const struct IMPL_NFA_T *limex, - const ReportID *exReports, u64a offset, struct CONTEXT_T *ctx, struct proto_cache *new_cache, @@ -133,7 +130,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, char *repeat_state = ctx->repeat_state + info->stateOffset; if (e->trigger == LIMEX_TRIGGER_POS) { - char cyclic_on = TESTBIT_STATE(STATE_ARG_P, info->cyclicState); + char cyclic_on = TESTBIT_STATE(*STATE_ARG_P, info->cyclicState); processPosTrigger(repeat, repeat_ctrl, repeat_state, offset, cyclic_on); *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; @@ -149,8 +146,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; DEBUG_PRINTF("stale history, squashing cyclic state\n"); assert(e->hasSquash == LIMEX_SQUASH_TUG); - STORE_STATE(succ, AND_STATE(LOAD_STATE(succ), - LOAD_STATE(&e->squash))); + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); return 1; // continue } else if (rv == TRIGGER_SUCCESS_CACHE) { new_cache->br = 1; @@ -164,7 +160,8 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, // Some exceptions fire accepts. if (e->reports != MO_INVALID_IDX) { if (flags & CALLBACK_OUTPUT) { - const ReportID *reports = exReports + e->reports; + const ReportID *reports = + (const ReportID *)((const char *)limex + e->reports); if (unlikely(limexRunReports(reports, ctx->callback, ctx->context, offset) == MO_HALT_MATCHING)) { @@ -188,18 +185,16 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, // Most exceptions have a set of successors to switch on. `local_succ' is // ORed into `succ' at the end of the caller's loop. #ifndef BIG_MODEL - *local_succ = OR_STATE(*local_succ, LOAD_STATE(&e->successors)); + *local_succ = OR_STATE(*local_succ, LOAD_FROM_ENG(&e->successors)); #else - STORE_STATE(&ctx->local_succ, OR_STATE(LOAD_STATE(&ctx->local_succ), - LOAD_STATE(&e->successors))); + ctx->local_succ = OR_STATE(ctx->local_succ, LOAD_FROM_ENG(&e->successors)); #endif // Some exceptions squash states behind them. Note that we squash states in // 'succ', not local_succ. 
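As a concrete illustration of what "squashing" means in the code below: the squash mask is ANDed into the successor set, clearing the states the exception wants dead. A toy 8-state example with hypothetical values:

    uint8_t succ   = 0x16; // 0b00010110: states 1, 2 and 4 on
    uint8_t squash = 0xef; // 0b11101111: mask that kills state 4
    succ &= squash;        // 0b00000110: states 1 and 2 survive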
- if (e->hasSquash == LIMEX_SQUASH_CYCLIC || - e->hasSquash == LIMEX_SQUASH_REPORT) { - STORE_STATE(succ, AND_STATE(LOAD_STATE(succ), - LOAD_STATE(&e->squash))); + if (e->hasSquash == LIMEX_SQUASH_CYCLIC + || e->hasSquash == LIMEX_SQUASH_REPORT) { + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); if (*cacheable == CACHE_RESULT) { *cacheable = DO_NOT_CACHE_RESULT; } @@ -215,13 +210,12 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, static really_inline int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, - const ReportID *exReports, u64a offset, struct CONTEXT_T *ctx, - char in_rev, char flags) { + u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { assert(diffmask > 0); // guaranteed by caller macro - if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) { + if (EQ_STATE(estate, ctx->cached_estate)) { DEBUG_PRINTF("using cached succ from previous state\n"); - STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), LOAD_STATE(&ctx->cached_esucc))); + *succ = OR_STATE(*succ, ctx->cached_esucc); if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { DEBUG_PRINTF("firing cached reports from previous state\n"); if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, @@ -236,7 +230,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #ifndef BIG_MODEL STATE_T local_succ = ZERO_STATE; #else - STORE_STATE(&ctx->local_succ, ZERO_STATE); + ctx->local_succ = ZERO_STATE; #endif // A copy of the estate as an array of GPR-sized chunks. @@ -254,7 +248,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; base_index[0] = 0; - for (u32 i = 0; i < ARRAY_LENGTH(base_index) - 1; i++) { + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); } @@ -276,31 +270,31 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #ifndef BIG_MODEL &local_succ, #endif - limex, exReports, offset, ctx, &new_cache, - &cacheable, in_rev, flags)) { + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { return PE_RV_HALT; } } while (word); } while (diffmask); #ifndef BIG_MODEL - STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), local_succ)); + *succ = OR_STATE(*succ, local_succ); #else - STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), ctx->local_succ)); + *succ = OR_STATE(*succ, ctx->local_succ); #endif if (cacheable == CACHE_RESULT) { - STORE_STATE(&ctx->cached_estate, estate); + ctx->cached_estate = estate; #ifndef BIG_MODEL ctx->cached_esucc = local_succ; #else - STORE_STATE(&ctx->cached_esucc, LOAD_STATE(&ctx->local_succ)); + ctx->cached_esucc = ctx->local_succ; #endif ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { if (ctx->cached_br) { - STORE_STATE(&ctx->cached_estate, ZERO_STATE); + ctx->cached_estate = ZERO_STATE; } } @@ -314,8 +308,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #undef EQ_STATE #undef OR_STATE #undef TESTBIT_STATE -#undef LOAD_STATE -#undef STORE_STATE #undef PE_FN #undef RUN_EXCEPTION_FN #undef CONTEXT_T @@ -333,11 +325,9 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #undef STATE_ARG_NAME #undef STATE_ARG_P +#undef IMPL_NFA_T + #undef CHUNK_T #undef FIND_AND_CLEAR_FN -#undef IMPL_NFA_T -#undef GET_NFA_REPEAT_INFO_FN - -// Parameters. 
-#undef SIZE -#undef STATE_T +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index c37f5f40..723803c1 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -132,7 +132,6 @@ struct LimExNFA##size { \ u32 acceptEodOffset; /* rel. to start of LimExNFA */ \ u32 exceptionCount; \ u32 exceptionOffset; /* rel. to start of LimExNFA */ \ - u32 exReportOffset; /* rel. to start of LimExNFA */ \ u32 repeatCount; \ u32 repeatOffset; \ u32 squashOffset; /* rel. to start of LimExNFA; for accept squashing */ \ @@ -160,6 +159,7 @@ struct LimExNFA##size { \ }; CREATE_NFA_LIMEX(32) +CREATE_NFA_LIMEX(64) CREATE_NFA_LIMEX(128) CREATE_NFA_LIMEX(256) CREATE_NFA_LIMEX(384) @@ -183,9 +183,16 @@ struct NFARepeatInfo { }; struct NFAAccept { - u32 state; //!< state ID of triggering state - ReportID externalId; //!< report ID to raise - u32 squash; //!< offset into masks, or MO_INVALID_IDX + u8 single_report; //!< If true, 'reports' is report id. + + /** + * \brief If single report is true, this is the report id to fire. + * Otherwise, it is the offset (relative to the start of the LimExNFA + * structure) of a list of reports, terminated with MO_INVALID_IDX. + */ + u32 reports; + + u32 squash; //!< Offset (from LimEx) into squash masks, or MO_INVALID_IDX. }; #endif diff --git a/src/nfa/limex_limits.h b/src/nfa/limex_limits.h index 9b35b115..f4df54a4 100644 --- a/src/nfa/limex_limits.h +++ b/src/nfa/limex_limits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,5 @@ #define NFA_MAX_STATES 512 /**< max states in an NFA */ #define NFA_MAX_ACCEL_STATES 8 /**< max accel states in a NFA */ -#define NFA_MAX_TOP_MASKS 32 /**< max number of MQE_TOP_N event types */ #endif diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index 8a0a8acd..f6f5809c 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -49,12 +49,13 @@ #include "limex_runtime.h" // Other implementation code from X-Macro impl. 
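Given the new `NFAAccept` encoding above, a consumer branches on `single_report`: either the `reports` field *is* the report id, or it is a byte offset from the engine base to a sentinel-terminated list. A sketch of the resolution (`handleReport` is a hypothetical callback; the real plumbing is `limexRunAccept`, added to limex_runtime.h later in this diff):

    static inline void fireAccept(const char *limex_base,
                                  const struct NFAAccept *a) {
        if (a->single_report) {
            handleReport(a->reports); // field holds the id itself
            return;
        }
        const ReportID *r = (const ReportID *)(limex_base + a->reports);
        for (; *r != MO_INVALID_IDX; r++) {
            handleReport(*r); // offset to a sentinel-terminated list
        }
    }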
-#define SIZE 32 -#define STATE_T u32 +#define SIZE 32 +#define STATE_T u32 +#define ENG_STATE_T u32 +#define LOAD_FROM_ENG load_u32 + #include "limex_state_impl.h" -#define SIZE 32 -#define STATE_T u32 #define INLINE_ATTR really_inline #include "limex_common_impl.h" @@ -64,8 +65,6 @@ // Process exceptional states -#define SIZE 32 -#define STATE_T u32 #define STATE_ON_STACK #define ESTATE_ON_STACK #define RUN_EXCEPTION_FN_ONLY @@ -74,8 +73,7 @@ static really_inline int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, const struct LimExNFA32 *limex, - const struct NFAException32 *exceptions, - const ReportID *exReports, u64a offset, + const struct NFAException32 *exceptions, u64a offset, struct NFAContext32 *ctx, char in_rev, char flags) { assert(estate != 0); // guaranteed by calling macro @@ -105,8 +103,8 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, u32 bit = findAndClearLSB_32(&estate); u32 idx = rank_in_mask32(limex->exceptionMask, bit); const struct NFAException32 *e = &exceptions[idx]; - if (!runException32(e, s, succ, &local_succ, limex, exReports, offset, - ctx, &new_cache, &cacheable, in_rev, flags)) { + if (!runException32(e, s, succ, &local_succ, limex, offset, ctx, + &new_cache, &cacheable, in_rev, flags)) { return PE_RV_HALT; } } while (estate != 0); @@ -128,7 +126,4 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, } // 32-bit models. - -#define SIZE 32 -#define STATE_T u32 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h index e0c182fc..6109d382 100644 --- a/src/nfa/limex_runtime.h +++ b/src/nfa/limex_runtime.h @@ -30,8 +30,8 @@ \brief Limex Execution Engine Or: How I Learned To Stop Worrying And Love The Preprocessor - This file includes utility functions which do not depend on the state size or - shift masks directly. + This file includes utility functions which do not depend on the size of the + state or shift masks directly. */ #ifndef LIMEX_RUNTIME_H @@ -72,41 +72,6 @@ struct proto_cache { const ReportID *reports; }; -// Shift macros for Limited NFAs. Defined in terms of uniform ops. -// LimExNFAxxx ptr in 'limex' and the current state in 's' -#define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \ - (JOIN(lshift_, nels_type)( \ - JOIN(and_, nels_type)(s, \ - JOIN(load_, nels_type)(&limex->shift[nels_i])), \ - limex->shiftAmount[nels_i])) - -// Calculate the (limited model) successors for a number of variable shifts. -// Assumes current state in 's' and successors in 'succ'. 
- -#define NFA_EXEC_GET_LIM_SUCC(gls_type) \ - do { \ - succ = NFA_EXEC_LIM_SHIFT(gls_type, 0); \ - switch (limex->shiftCount) { \ - case 8: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 7)); \ - case 7: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 6)); \ - case 6: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 5)); \ - case 5: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 4)); \ - case 4: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 3)); \ - case 3: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 2)); \ - case 2: \ - succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 1)); \ - case 1: \ - case 0: \ - ; \ - } \ - } while (0) - #define PE_RV_HALT 1 #ifdef STATE_ON_STACK @@ -138,14 +103,42 @@ int limexRunReports(const ReportID *reports, NfaCallback callback, return MO_CONTINUE_MATCHING; // continue } +static really_inline +int limexRunAccept(const char *limex_base, const struct NFAAccept *accept, + NfaCallback callback, void *context, u64a offset) { + if (accept->single_report) { + const ReportID report = accept->reports; + DEBUG_PRINTF("firing single report for id %u at offset %llu\n", report, + offset); + return callback(0, offset, report, context); + } + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + return limexRunReports(reports, callback, context, offset); +} + +static really_inline +int limexAcceptHasReport(const char *limex_base, const struct NFAAccept *accept, + ReportID report) { + if (accept->single_report) { + return accept->reports == report; + } + + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + assert(*reports != MO_INVALID_IDX); + do { + if (*reports == report) { + return 1; + } + reports++; + } while (*reports != MO_INVALID_IDX); + + return 0; +} + /** \brief Return a (correctly typed) pointer to the exception table. */ #define getExceptionTable(exc_type, lim) \ ((const exc_type *)((const char *)(lim) + (lim)->exceptionOffset)) -/** \brief Return a pointer to the exceptional reports list. */ -#define getExReports(lim) \ - ((const ReportID *)((const char *)(lim) + (lim)->exReportOffset)) - /** \brief Return a pointer to the ordinary accepts table. */ #define getAcceptTable(lim) \ ((const struct NFAAccept *)((const char *)(lim) + (lim)->acceptOffset)) @@ -170,6 +163,7 @@ int limexRunReports(const ReportID *reports, NfaCallback callback, } MAKE_GET_NFA_REPEAT_INFO(32) +MAKE_GET_NFA_REPEAT_INFO(64) MAKE_GET_NFA_REPEAT_INFO(128) MAKE_GET_NFA_REPEAT_INFO(256) MAKE_GET_NFA_REPEAT_INFO(384) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 881e41fd..45ceb2b5 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -29,7 +29,6 @@ #include "util/join.h" #include - /** \file * \brief Limex Execution Engine Or: * How I Learned To Stop Worrying And Love The Preprocessor @@ -37,8 +36,9 @@ * Version 2.0: now with X-Macros, so you get line numbers in your debugger. */ -#if !defined(SIZE) || !defined(STATE_T) -# error Must define SIZE and STATE_T in includer. + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. 
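A quick reminder of how the uniform-op names resolve: `JOIN` (from util/join.h) is the usual two-level token paste, so with `STATE_T` defined as `m128`, `AND_STATE` becomes `and_m128`, `LSHIFT_STATE` becomes `lshift_m128`, and so on, while `LOAD_FROM_ENG` is supplied directly by the includer (e.g. `load_m128`). The two-level form is what lets `STATE_T` expand before concatenation (sketch, assuming the conventional definition):

    #define JOIN_(a, b) a##b
    #define JOIN(a, b) JOIN_(a, b)   // expands args, then pastes
    // JOIN(and_, STATE_T) with STATE_T == m128  ->  and_m128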
#endif #define LIMEX_API_ROOT JOIN(nfaExecLimEx, SIZE) @@ -46,7 +46,6 @@ #define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) #define TESTEOD_FN JOIN(moNfaTestEod, SIZE) -#define TESTEOD_REV_FN JOIN(moNfaRevTestEod, SIZE) #define INITIAL_FN JOIN(moNfaInitial, SIZE) #define TOP_FN JOIN(moNfaTop, SIZE) #define TOPN_FN JOIN(moNfaTopN, SIZE) @@ -67,11 +66,10 @@ #define STREAMSILENT_FN JOIN(LIMEX_API_ROOT, _Stream_Silent) #define CONTEXT_T JOIN(NFAContext, SIZE) #define EXCEPTION_T JOIN(struct NFAException, SIZE) -#define LOAD_STATE JOIN(load_, STATE_T) -#define STORE_STATE JOIN(store_, STATE_T) #define AND_STATE JOIN(and_, STATE_T) #define ANDNOT_STATE JOIN(andnot_, STATE_T) #define OR_STATE JOIN(or_, STATE_T) +#define LSHIFT_STATE JOIN(lshift_, STATE_T) #define TESTBIT_STATE JOIN(testbit_, STATE_T) #define CLEARBIT_STATE JOIN(clearbit_, STATE_T) #define ZERO_STATE JOIN(zero_, STATE_T) @@ -96,17 +94,16 @@ #define ACCEL_AND_FRIENDS_MASK accel_and_friendsMask #define EXCEPTION_MASK exceptionMask #else -#define ACCEL_MASK LOAD_STATE(&limex->accel) -#define ACCEL_AND_FRIENDS_MASK LOAD_STATE(&limex->accel_and_friends) -#define EXCEPTION_MASK LOAD_STATE(&limex->exceptionMask) +#define ACCEL_MASK LOAD_FROM_ENG(&limex->accel) +#define ACCEL_AND_FRIENDS_MASK LOAD_FROM_ENG(&limex->accel_and_friends) +#define EXCEPTION_MASK LOAD_FROM_ENG(&limex->exceptionMask) #endif // Run exception processing, if necessary. Returns 0 if scanning should // continue, 1 if an accept was fired and the user instructed us to halt. static really_inline char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, - const ReportID *exReports, STATE_T s, - const STATE_T emask, size_t i, u64a offset, + STATE_T s, const STATE_T emask, size_t i, u64a offset, STATE_T *succ, u64a *final_loc, struct CONTEXT_T *ctx, const char flags, const char in_rev, const char first_match) { @@ -117,13 +114,13 @@ char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, } if (first_match && i) { - STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); STATE_T foundAccepts = AND_STATE(s, acceptMask); if (unlikely(ISNONZERO_STATE(foundAccepts))) { DEBUG_PRINTF("first match at %zu\n", i); DEBUG_PRINTF("for nfa %p\n", limex); assert(final_loc); - STORE_STATE(&ctx->s, s); + ctx->s = s; *final_loc = i; return 1; // Halt matching. } @@ -133,7 +130,7 @@ char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, char localflags = (!i && !in_rev) ? NO_OUTPUT | FIRST_BYTE : flags; int rv = JOIN(processExceptional, SIZE)( - pass_state, pass_estate, diffmask, succ, limex, exceptions, exReports, + pass_state, pass_estate, diffmask, succ, limex, exceptions, callback_offset, ctx, in_rev, localflags); if (rv == PE_RV_HALT) { return 1; // Halt matching. @@ -161,22 +158,55 @@ size_t RUN_ACCEL_FN(const STATE_T s, UNUSED const STATE_T accelMask, return j; } +// Shift macros for Limited NFAs. Defined in terms of uniform ops. +// LimExNFAxxx ptr in 'limex' and the current state in 's' +#define NFA_EXEC_LIM_SHIFT(limex_m, curr_m, shift_idx) \ + LSHIFT_STATE(AND_STATE(curr_m, LOAD_FROM_ENG(&limex_m->shift[shift_idx])), \ + limex_m->shiftAmount[shift_idx]) + +// Calculate the (limited model) successors for a number of variable shifts. +// Assumes current state in 'curr_m' and places the successors in 'succ_m'. 
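The `NFA_EXEC_GET_LIM_SUCC` macro that follows unrolls, via deliberate case fall-through, what is conceptually this loop (a plain sketch for the 32-bit model; the real macro works on any STATE_T via the uniform ops):

    static inline u32 limSucc32(u32 s, const struct LimExNFA32 *limex) {
        u32 succ = 0;
        for (u32 i = 0; i < limex->shiftCount; i++) {
            succ |= (s & limex->shift[i]) << limex->shiftAmount[i];
        }
        return succ;
    }

The unrolled switch exists so that engines with only one or two shifts pay for exactly as many shift/OR pairs as they use.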
+#define NFA_EXEC_GET_LIM_SUCC(limex_m, curr_m, succ_m) \ + do { \ + succ_m = NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 0); \ + switch (limex_m->shiftCount) { \ + case 8: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 7)); \ + case 7: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 6)); \ + case 6: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 5)); \ + case 5: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 4)); \ + case 4: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 3)); \ + case 3: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 2)); \ + case 2: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 1)); \ + case 1: \ + case 0: \ + ; \ + } \ + } while (0) + + static really_inline char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, struct CONTEXT_T *ctx, u64a offset, const char flags, u64a *final_loc, const char first_match) { - const STATE_T *reach = (const STATE_T *)((const char *)limex + sizeof(*limex)); + const ENG_STATE_T *reach = get_reach_table(limex); #if SIZE < 256 - const STATE_T accelMask = LOAD_STATE(&limex->accel); - const STATE_T accel_and_friendsMask = LOAD_STATE(&limex->accel_and_friends); - const STATE_T exceptionMask = LOAD_STATE(&limex->exceptionMask); + const STATE_T accelMask = LOAD_FROM_ENG(&limex->accel); + const STATE_T accel_and_friendsMask + = LOAD_FROM_ENG(&limex->accel_and_friends); + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); #endif const u8 *accelTable = (const u8 *)((const char *)limex + limex->accelTableOffset); const union AccelAux *accelAux = (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); - const ReportID *exReports = getExReports(limex); - STATE_T s = LOAD_STATE(&ctx->s); + STATE_T s = ctx->s; /* assert(ISALIGNED_16(exceptions)); */ /* assert(ISALIGNED_16(reach)); */ @@ -195,21 +225,20 @@ without_accel: DUMP_INPUT(i); if (ISZERO_STATE(s)) { DEBUG_PRINTF("no states are switched on, early exit\n"); - STORE_STATE(&ctx->s, s); + ctx->s = s; return MO_CONTINUE_MATCHING; } u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T); + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, - i, offset, &succ, final_loc, ctx, flags, 0, - first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { return MO_HALT_MATCHING; } - s = AND_STATE(succ, LOAD_STATE(&reach[limex->reachMap[c]])); + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } with_accel: @@ -252,33 +281,30 @@ with_accel: u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T); + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, - i, offset, &succ, final_loc, ctx, flags, 0, - first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { return MO_HALT_MATCHING; } - s = AND_STATE(succ, LOAD_STATE(&reach[limex->reachMap[c]])); + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } - STORE_STATE(&ctx->s, s); + ctx->s = s; if ((first_match || (flags & CALLBACK_OUTPUT)) && limex->acceptCount) { - STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); const struct NFAAccept *acceptTable = 
getAcceptTable(limex); - const u32 acceptCount = limex->acceptCount; - STATE_T foundAccepts = AND_STATE(s, acceptMask); if (unlikely(ISNONZERO_STATE(foundAccepts))) { if (first_match) { - STORE_STATE(&ctx->s, s); + ctx->s = s; assert(final_loc); *final_loc = length; return MO_HALT_MATCHING; - } else if (PROCESS_ACCEPTS_FN(limex, &ctx->s, acceptTable, - acceptCount, offset + length, + } else if (PROCESS_ACCEPTS_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset + length, ctx->callback, ctx->context)) { return MO_HALT_MATCHING; } @@ -294,13 +320,12 @@ with_accel: static never_inline char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, struct CONTEXT_T *ctx, u64a offset) { - const STATE_T *reach = (const STATE_T *)((const char *)limex + sizeof(*limex)); + const ENG_STATE_T *reach = get_reach_table(limex); #if SIZE < 256 - const STATE_T exceptionMask = LOAD_STATE(&limex->exceptionMask); + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); #endif const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); - const ReportID *exReports = getExReports(limex); - STATE_T s = LOAD_STATE(&ctx->s); + STATE_T s = ctx->s; /* assert(ISALIGNED_16(exceptions)); */ /* assert(ISALIGNED_16(reach)); */ @@ -311,34 +336,33 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, DUMP_INPUT(i-1); if (ISZERO_STATE(s)) { DEBUG_PRINTF("no states are switched on, early exit\n"); - STORE_STATE(&ctx->s, s); + ctx->s = s; return MO_CONTINUE_MATCHING; } u8 c = input[i-1]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T); + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, - EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, - flags, 1, 0)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 1, 0)) { return MO_HALT_MATCHING; } - s = AND_STATE(succ, reach[limex->reachMap[c]]); + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } - STORE_STATE(&ctx->s, s); + ctx->s = s; - STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); const struct NFAAccept *acceptTable = getAcceptTable(limex); const u32 acceptCount = limex->acceptCount; assert(flags & CALLBACK_OUTPUT); if (acceptCount) { STATE_T foundAccepts = AND_STATE(s, acceptMask); if (unlikely(ISNONZERO_STATE(foundAccepts))) { - if (PROCESS_ACCEPTS_NOSQUASH_FN(&ctx->s, acceptTable, acceptCount, - offset, ctx->callback, + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset, ctx->callback, ctx->context)) { return MO_HALT_MATCHING; } @@ -354,9 +378,9 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, return; } - STATE_T s = LOAD_STATE(src); + STATE_T s = *(STATE_T *)src; - if (ISZERO_STATE(AND_STATE(s, LOAD_STATE(&limex->repeatCyclicMask)))) { + if (ISZERO_STATE(AND_STATE(LOAD_FROM_ENG(&limex->repeatCyclicMask), s))) { DEBUG_PRINTF("no cyclics are on\n"); return; } @@ -369,7 +393,7 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); - if (!TESTBIT_STATE(&s, info->cyclicState)) { + if (!TESTBIT_STATE(s, info->cyclicState)) { DEBUG_PRINTF("is dead\n"); continue; } @@ -388,7 +412,7 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, offset); } - STORE_STATE(src, s); + *(STATE_T *)src = s; } char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct 
NFA *n, @@ -411,7 +435,7 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, // Note: state has already been expanded into 'dest'. const STATE_T cyclics = - AND_STATE(LOAD_STATE(dest), LOAD_STATE(&limex->repeatCyclicMask)); + AND_STATE(*(STATE_T *)dest, LOAD_FROM_ENG(&limex->repeatCyclicMask)); if (ISZERO_STATE(cyclics)) { DEBUG_PRINTF("no cyclics are on\n"); return; @@ -425,7 +449,7 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); - if (!TESTBIT_STATE(&cyclics, info->cyclicState)) { + if (!TESTBIT_STATE(cyclics, info->cyclicState)) { DEBUG_PRINTF("is dead\n"); continue; } @@ -447,9 +471,8 @@ char JOIN(LIMEX_API_ROOT, _expandState)(const struct NFA *n, void *dest, return 0; } -char JOIN(LIMEX_API_ROOT, _queueInitState)(const struct NFA *n, - struct mq *q) { - STORE_STATE(q->state, ZERO_STATE); +char JOIN(LIMEX_API_ROOT, _queueInitState)(const struct NFA *n, struct mq *q) { + *(STATE_T *)q->state = ZERO_STATE; // Zero every bounded repeat control block in state. const IMPL_NFA_T *limex = getImplNfa(n); @@ -529,7 +552,7 @@ void JOIN(LIMEX_API_ROOT, _HandleEvent)(const IMPL_NFA_T *limex, u32 e = q->items[q->cur].type; switch (e) { DEFINE_CASE(MQE_TOP) - STORE_STATE(&ctx->s, TOP_FN(limex, !!sp, LOAD_STATE(&ctx->s))); + ctx->s = TOP_FN(limex, !!sp, ctx->s); break; DEFINE_CASE(MQE_START) break; @@ -539,8 +562,7 @@ void JOIN(LIMEX_API_ROOT, _HandleEvent)(const IMPL_NFA_T *limex, assert(e >= MQE_TOP_FIRST); assert(e < MQE_INVALID); DEBUG_PRINTF("MQE_TOP + %d\n", ((int)e - MQE_TOP_FIRST)); - STORE_STATE(&ctx->s, - TOPN_FN(limex, LOAD_STATE(&ctx->s), e - MQE_TOP_FIRST)); + ctx->s = TOPN_FN(limex, ctx->s, e - MQE_TOP_FIRST); } #undef DEFINE_CASE } @@ -570,12 +592,12 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { ctx.repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb; ctx.context = q->context; - STORE_STATE(&ctx.cached_estate, ZERO_STATE); + ctx.cached_estate = ZERO_STATE; ctx.cached_br = 0; assert(q->items[q->cur].location >= 0); DEBUG_PRINTF("LOAD STATE\n"); - STORE_STATE(&ctx.s, LOAD_STATE(q->state)); + ctx.s = *(STATE_T *)q->state; assert(q->items[q->cur].type == MQE_START); u64a offset = q->offset; @@ -599,7 +621,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { assert(ep - offset <= q->length); if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp) == MO_HALT_MATCHING) { - STORE_STATE(q->state, ZERO_STATE); + *(STATE_T *)q->state = ZERO_STATE; return 0; } @@ -616,7 +638,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { q->items[q->cur].type = MQE_START; q->items[q->cur].location = sp - offset; DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; return MO_ALIVE; } @@ -628,7 +650,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { EXPIRE_ESTATE_FN(limex, &ctx, sp); DEBUG_PRINTF("END\n"); - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; if (q->cur != q->end) { q->cur--; @@ -637,7 +659,7 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { return MO_ALIVE; } - return ISNONZERO_STATE(LOAD_STATE(&ctx.s)); + return ISNONZERO_STATE(ctx.s); } /* used by suffix execution in Rose */ @@ -665,11 +687,11 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA 
*n, struct mq *q, s64a end) { ctx.repeat_state = q->streamState + limex->stateSize; ctx.callback = q->cb; ctx.context = q->context; - STORE_STATE(&ctx.cached_estate, ZERO_STATE); + ctx.cached_estate = ZERO_STATE; ctx.cached_br = 0; DEBUG_PRINTF("LOAD STATE\n"); - STORE_STATE(&ctx.s, LOAD_STATE(q->state)); + ctx.s = *(STATE_T *)q->state; assert(q->items[q->cur].type == MQE_START); u64a offset = q->offset; @@ -699,7 +721,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { q->cur--; q->items[q->cur].type = MQE_START; q->items[q->cur].location = sp + final_look - offset; - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; return MO_MATCHES_PENDING; } @@ -721,7 +743,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { q->cur--; q->items[q->cur].type = MQE_START; q->items[q->cur].location = sp + final_look - offset; - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; return MO_MATCHES_PENDING; } @@ -737,7 +759,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { q->items[q->cur].type = MQE_START; q->items[q->cur].location = sp - offset; DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; return MO_ALIVE; } @@ -749,7 +771,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { EXPIRE_ESTATE_FN(limex, &ctx, sp); DEBUG_PRINTF("END\n"); - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; if (q->cur != q->end) { q->cur--; @@ -758,7 +780,7 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { return MO_ALIVE; } - return ISNONZERO_STATE(LOAD_STATE(&ctx.s)); + return ISNONZERO_STATE(ctx.s); } // Used for execution Rose prefix/infixes. @@ -777,11 +799,11 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, ctx.repeat_state = q->streamState + limex->stateSize; ctx.callback = NULL; ctx.context = NULL; - STORE_STATE(&ctx.cached_estate, ZERO_STATE); + ctx.cached_estate = ZERO_STATE; ctx.cached_br = 0; DEBUG_PRINTF("LOAD STATE\n"); - STORE_STATE(&ctx.s, LOAD_STATE(q->state)); + ctx.s = *(STATE_T *)q->state; assert(q->items[q->cur].type == MQE_START); u64a offset = q->offset; @@ -793,7 +815,7 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, if (n->maxWidth) { if (ep - sp > n->maxWidth) { sp = ep - n->maxWidth; - STORE_STATE(&ctx.s, INITIAL_FN(limex, !!sp)); + ctx.s = INITIAL_FN(limex, !!sp); } } assert(ep >= sp); @@ -832,14 +854,14 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, DEBUG_PRINTF("END, nfa is %s\n", ISNONZERO_STATE(ctx.s) ? 
"still alive" : "dead"); - STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + *(STATE_T *)q->state = ctx.s; - if (JOIN(limexInAccept, SIZE)(limex, LOAD_STATE(&ctx.s), ctx.repeat_ctrl, + if (JOIN(limexInAccept, SIZE)(limex, ctx.s, ctx.repeat_ctrl, ctx.repeat_state, sp + 1, report)) { return MO_MATCHES_PENDING; } - return ISNONZERO_STATE(LOAD_STATE(&ctx.s)); + return ISNONZERO_STATE(ctx.s); } char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, @@ -852,8 +874,8 @@ char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, const union RepeatControl *repeat_ctrl = getRepeatControlBaseConst(state, sizeof(STATE_T)); const char *repeat_state = streamState + limex->stateSize; - return TESTEOD_FN(limex, sptr, repeat_ctrl, repeat_state, offset, 1, - callback, context); + return TESTEOD_FN(limex, sptr, repeat_ctrl, repeat_state, offset, callback, + context); } char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) { @@ -875,11 +897,11 @@ char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, ctx.repeat_state = NULL; ctx.callback = cb; ctx.context = context; - STORE_STATE(&ctx.cached_estate, ZERO_STATE); + ctx.cached_estate = ZERO_STATE; ctx.cached_br = 0; const IMPL_NFA_T *limex = getImplNfa(n); - STORE_STATE(&ctx.s, INITIAL_FN(limex, 0)); // always anchored + ctx.s = INITIAL_FN(limex, 0); // always anchored // 'buf' may be null, for example when we're scanning at EOD time. if (buflen) { @@ -896,8 +918,11 @@ char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, REV_STREAM_FN(limex, hbuf, hlen, &ctx, offset); } - if (offset == 0 && ISNONZERO_STATE(LOAD_STATE(&ctx.s))) { - TESTEOD_REV_FN(limex, &ctx.s, offset, cb, context); + if (offset == 0 && limex->acceptEodCount && ISNONZERO_STATE(ctx.s)) { + const union RepeatControl *repeat_ctrl = NULL; + const char *repeat_state = NULL; + TESTEOD_FN(limex, &ctx.s, repeat_ctrl, repeat_state, offset, cb, + context); } // NOTE: return value is unused. 
@@ -913,7 +938,7 @@ char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa, union RepeatControl *repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); char *repeat_state = q->streamState + limex->stateSize; - STATE_T state = LOAD_STATE(q->state); + STATE_T state = *(STATE_T *)q->state; u64a offset = q->offset + q_last_loc(q) + 1; return JOIN(limexInAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, @@ -928,7 +953,7 @@ char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { union RepeatControl *repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); char *repeat_state = q->streamState + limex->stateSize; - STATE_T state = LOAD_STATE(q->state); + STATE_T state = *(STATE_T *)q->state; u64a offset = q->offset + q_last_loc(q) + 1; return JOIN(limexInAnyAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, @@ -941,8 +966,8 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( s64a loc) { assert(nfa->flags & NFA_ZOMBIE); const IMPL_NFA_T *limex = getImplNfa(nfa); - STATE_T state = LOAD_STATE(q->state); - STATE_T zmask = LOAD_STATE(&limex->zombieMask); + STATE_T state = *(STATE_T *)q->state; + STATE_T zmask = LOAD_FROM_ENG(&limex->zombieMask); if (limex->repeatCount) { u64a offset = q->offset + loc + 1; @@ -960,7 +985,6 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( } #undef TESTEOD_FN -#undef TESTEOD_REV_FN #undef INITIAL_FN #undef TOP_FN #undef TOPN_FN @@ -981,11 +1005,10 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( #undef STREAMSILENT_FN #undef CONTEXT_T #undef EXCEPTION_T -#undef LOAD_STATE -#undef STORE_STATE #undef AND_STATE #undef ANDNOT_STATE #undef OR_STATE +#undef LSHIFT_STATE #undef TESTBIT_STATE #undef CLEARBIT_STATE #undef ZERO_STATE @@ -999,8 +1022,4 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( #undef ACCEL_MASK #undef ACCEL_AND_FRIENDS_MASK #undef EXCEPTION_MASK - -// Parameters. -#undef SIZE -#undef STATE_T #undef LIMEX_API_ROOT diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index e45e4331..5ca8fce0 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -41,52 +41,6 @@ #include "util/bitutils.h" #include "util/simd_utils.h" -#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) -#define HAVE_PEXT -#endif - -static really_inline -u32 packedExtract32(u32 x, u32 mask) { -#if defined(HAVE_PEXT) - // Intel BMI2 can do this operation in one instruction. - return _pext_u32(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_32(&mask); - if (x & (1U << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif -} - -static really_inline -u32 packedExtract64(u64a x, u64a mask) { -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) - // Intel BMI2 can do this operation in one instruction. - return _pext_u64(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_64(&mask); - if (x & (1ULL << bit)) { - assert(num != 0); // more than 32 bits! 
- result |= num; - } - num <<= 1; - } - return result; -#endif -} - -#undef HAVE_PEXT - static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb(s, permute); diff --git a/src/nfa/limex_simd128.c b/src/nfa/limex_simd128.c index f0fb1dd4..c5f2b33e 100644 --- a/src/nfa/limex_simd128.c +++ b/src/nfa/limex_simd128.c @@ -48,19 +48,16 @@ #include "limex_runtime.h" -#define SIZE 128 -#define STATE_T m128 +#define SIZE 128 +#define STATE_T m128 +#define ENG_STATE_T m128 +#define LOAD_FROM_ENG load_m128 + #include "limex_exceptional.h" -#define SIZE 128 -#define STATE_T m128 #include "limex_state_impl.h" -#define SIZE 128 -#define STATE_T m128 #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 128 -#define STATE_T m128 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd256.c b/src/nfa/limex_simd256.c index 57648b69..cc232908 100644 --- a/src/nfa/limex_simd256.c +++ b/src/nfa/limex_simd256.c @@ -45,19 +45,16 @@ // Common code #include "limex_runtime.h" -#define SIZE 256 -#define STATE_T m256 +#define SIZE 256 +#define STATE_T m256 +#define ENG_STATE_T m256 +#define LOAD_FROM_ENG load_m256 + #include "limex_exceptional.h" -#define SIZE 256 -#define STATE_T m256 #include "limex_state_impl.h" -#define SIZE 256 -#define STATE_T m256 #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 256 -#define STATE_T m256 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd384.c b/src/nfa/limex_simd384.c index 84061f61..7e596e48 100644 --- a/src/nfa/limex_simd384.c +++ b/src/nfa/limex_simd384.c @@ -45,19 +45,16 @@ // Common code #include "limex_runtime.h" -#define SIZE 384 -#define STATE_T m384 +#define SIZE 384 +#define STATE_T m384 +#define ENG_STATE_T m384 +#define LOAD_FROM_ENG load_m384 + #include "limex_exceptional.h" -#define SIZE 384 -#define STATE_T m384 #include "limex_state_impl.h" -#define SIZE 384 -#define STATE_T m384 #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 384 -#define STATE_T m384 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd512.c b/src/nfa/limex_simd512.c index a6646d83..f779f335 100644 --- a/src/nfa/limex_simd512.c +++ b/src/nfa/limex_simd512.c @@ -45,19 +45,16 @@ // Common code #include "limex_runtime.h" -#define SIZE 512 -#define STATE_T m512 +#define SIZE 512 +#define STATE_T m512 +#define ENG_STATE_T m512 +#define LOAD_FROM_ENG load_m512 + #include "limex_exceptional.h" -#define SIZE 512 -#define STATE_T m512 #include "limex_state_impl.h" -#define SIZE 512 -#define STATE_T m512 #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 512 -#define STATE_T m512 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_state_impl.h b/src/nfa/limex_state_impl.h index d6e89904..81153f71 100644 --- a/src/nfa/limex_state_impl.h +++ b/src/nfa/limex_state_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,8 +35,8 @@ #include "util/state_compress.h" #include -#if !defined(SIZE) || !defined(STATE_T) -# error Must define SIZE and STATE_T in includer. +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. 
#endif #define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) @@ -44,29 +44,33 @@ #define REACHMASK_FN JOIN(moNfaReachMask, SIZE) #define COMPRESS_FN JOIN(moNfaCompressState, SIZE) #define EXPAND_FN JOIN(moNfaExpandState, SIZE) -#define COMPRESSED_STORE_FN JOIN(storecompressed, SIZE) -#define COMPRESSED_LOAD_FN JOIN(loadcompressed, SIZE) +#define COMPRESSED_STORE_FN JOIN(store_compressed_, STATE_T) +#define COMPRESSED_LOAD_FN JOIN(load_compressed_, STATE_T) #define PARTIAL_STORE_FN JOIN(partial_store_, STATE_T) #define PARTIAL_LOAD_FN JOIN(partial_load_, STATE_T) -#define LOAD_STATE JOIN(load_, STATE_T) -#define STORE_STATE JOIN(store_, STATE_T) #define OR_STATE JOIN(or_, STATE_T) #define AND_STATE JOIN(and_, STATE_T) #define ISZERO_STATE JOIN(isZero_, STATE_T) static really_inline -const STATE_T *REACHMASK_FN(const IMPL_NFA_T *limex, const u8 key) { - const STATE_T *reach - = (const STATE_T *)((const char *)limex + sizeof(*limex)); - assert(ISALIGNED_N(reach, alignof(STATE_T))); - return &reach[limex->reachMap[key]]; +const ENG_STATE_T *get_reach_table(const IMPL_NFA_T *limex) { + const ENG_STATE_T *reach + = (const ENG_STATE_T *)((const char *)limex + sizeof(*limex)); + assert(ISALIGNED_N(reach, alignof(ENG_STATE_T))); + return reach; +} + +static really_inline +STATE_T REACHMASK_FN(const IMPL_NFA_T *limex, const u8 key) { + const ENG_STATE_T *reach = get_reach_table(limex); + return LOAD_FROM_ENG(&reach[limex->reachMap[key]]); } static really_inline void COMPRESS_FN(const IMPL_NFA_T *limex, u8 *dest, const STATE_T *src, u8 key) { assert(ISALIGNED_N(src, alignof(STATE_T))); - STATE_T a_src = LOAD_STATE(src); + STATE_T a_src = *src; DEBUG_PRINTF("compress state: %p -> %p\n", src, dest); @@ -77,31 +81,30 @@ void COMPRESS_FN(const IMPL_NFA_T *limex, u8 *dest, const STATE_T *src, } else { DEBUG_PRINTF("compress state, key=%hhx\n", key); - const STATE_T *reachmask = REACHMASK_FN(limex, key); + STATE_T reachmask = REACHMASK_FN(limex, key); // Masked compression means that we mask off the initDs states and // provide a shortcut for the all-zeroes case. Note that these must be // switched on in the EXPAND call below. 
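        /* [Editor's note] Concretely: only the state bits selected by
         * (compressMask & reachmask) are written out. States outside the
         * reach of the key byte cannot be live, and the always-on initDS
         * states are excluded by compressMask, so neither carries any
         * information. EXPAND_FN below reloads through the same mask and
         * then ORs initDS back in -- the "switched on" step this comment
         * refers to. */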
if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { - STATE_T s = AND_STATE(LOAD_STATE(&limex->compressMask), a_src); + STATE_T s = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), a_src); if (ISZERO_STATE(s)) { DEBUG_PRINTF("after compression mask, all states are zero\n"); memset(dest, 0, limex->stateSize); return; } - STATE_T mask = AND_STATE(LOAD_STATE(&limex->compressMask), - LOAD_STATE(reachmask)); + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); COMPRESSED_STORE_FN(dest, &s, &mask, limex->stateSize); } else { - COMPRESSED_STORE_FN(dest, src, reachmask, limex->stateSize); + COMPRESSED_STORE_FN(dest, src, &reachmask, limex->stateSize); } } } static really_inline -void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, - u8 key) { +void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, u8 key) { assert(ISALIGNED_N(dest, alignof(STATE_T))); DEBUG_PRINTF("expand state: %p -> %p\n", src, dest); @@ -111,16 +114,15 @@ void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, *dest = PARTIAL_LOAD_FN(src, limex->stateSize); } else { DEBUG_PRINTF("expand state, key=%hhx\n", key); - const STATE_T *reachmask = REACHMASK_FN(limex, key); + STATE_T reachmask = REACHMASK_FN(limex, key); if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { - STATE_T mask = AND_STATE(LOAD_STATE(&limex->compressMask), - LOAD_STATE(reachmask)); + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); COMPRESSED_LOAD_FN(dest, src, &mask, limex->stateSize); - STORE_STATE(dest, OR_STATE(LOAD_STATE(&limex->initDS), - LOAD_STATE(dest))); + *dest = OR_STATE(LOAD_FROM_ENG(&limex->initDS), *dest); } else { - COMPRESSED_LOAD_FN(dest, src, reachmask, limex->stateSize); + COMPRESSED_LOAD_FN(dest, src, &reachmask, limex->stateSize); } } } @@ -134,11 +136,6 @@ void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, #undef COMPRESSED_LOAD_FN #undef PARTIAL_STORE_FN #undef PARTIAL_LOAD_FN -#undef LOAD_STATE -#undef STORE_STATE #undef OR_STATE #undef AND_STATE #undef ISZERO_STATE - -#undef SIZE -#undef STATE_T diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 88da27c0..ceedb9db 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,10 +42,10 @@ static really_inline char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, - u16 s, u64a loc, char eod, u16 *const cached_accept_state, - u32 *const cached_accept_id) { - DEBUG_PRINTF("reporting state = %hu, loc=%llu, eod %hhu\n", - (u16)(s & STATE_MASK), loc, eod); + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); if (!eod && s == *cached_accept_state) { if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { @@ -89,27 +89,108 @@ char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, } static really_inline -char mcclellanExec16_i(const struct mcclellan *m, u16 *state, const u8 *buf, +const u8 *run_mcclellan_accel(const struct mcclellan *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + 
assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + s &= STATE_MASK; + + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec16_i(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **c_final, enum MatchMode mode) { assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } - u16 s = *state; - const u8 *c = buf, *c_end = buf + len; - const u16 *succ_table = (const u16 *)((const char *)m - + sizeof(struct mcclellan)); - assert(ISALIGNED_N(succ_table, 2)); - const u16 sherman_base = m->sherman_limit; - const char *sherman_base_offset - = (const char *)m - sizeof(struct NFA) + m->sherman_offset; - const u32 as = m->alphaShift; + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); s &= STATE_MASK; u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; - DEBUG_PRINTF("s: %hu, len %zu\n", s, len); + DEBUG_PRINTF("s: %u, len %zu\n", s, len); const u8 *min_accel_offset = c; if (!m->has_accel || len < ACCEL_MIN_LEN) { @@ -120,26 +201,19 @@ char mcclellanExec16_i(const struct mcclellan *m, u16 *state, const u8 *buf, goto with_accel; without_accel: - while (c < min_accel_offset && s) { - u8 cprime = m->remap[*(c++)]; - DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); - if (s < sherman_base) { - DEBUG_PRINTF("doing normal\n"); - assert(s < m->state_count); - s = succ_table[((u32)s << as) + cprime]; - } else { - const char *sherman_state - = findShermanState(m, sherman_base_offset, sherman_base, s); - DEBUG_PRINTF("doing sherman (%hu)\n", s); - s = doSherman16(sherman_state, cprime, 
succ_table, as); + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; } - DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { if (mode == STOP_AT_MATCH) { *state = s & STATE_MASK; *c_final = c - 1; - return MO_CONTINUE_MATCHING; + return MO_MATCHES_PENDING; } u64a loc = (c - 1) - buf + offAdj + 1; @@ -147,39 +221,51 @@ without_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; /* termination requested */ + return MO_DEAD; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, - &cached_accept_state, - &cached_accept_id) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; } } - s &= STATE_MASK; + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + s &= STATE_MASK; + + if (c == c_end) { + goto exit; + } else { + goto with_accel; } with_accel: - while (c < c_end && s) { - u8 cprime = m->remap[*(c++)]; - DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); - if (s < sherman_base) { - DEBUG_PRINTF("doing normal\n"); - assert(s < m->state_count); - s = succ_table[((u32)s << as) + cprime]; - } else { - const char *sherman_state - = findShermanState(m, sherman_base_offset, sherman_base, s); - DEBUG_PRINTF("doing sherman (%hu)\n", s); - s = doSherman16(sherman_state, cprime, succ_table, as); + do { + assert(c < c_end); + if (!s) { + goto exit; } - DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal16(m, &c, c_end, s, 1, mode); if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { if (mode == STOP_AT_MATCH) { *state = s & STATE_MASK; *c_final = c - 1; - return MO_CONTINUE_MATCHING; + return MO_MATCHES_PENDING; } u64a loc = (c - 1) - buf + offAdj + 1; @@ -187,56 +273,31 @@ with_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; /* termination requested */ + return MO_DEAD; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, - &cached_accept_state, - &cached_accept_id) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; } - } else if (s & ACCEL_FLAG) { - DEBUG_PRINTF("skipping\n"); - const struct mstate_aux *this_aux = get_aux(m, s & STATE_MASK); - u32 accel_offset = this_aux->accel_offset; - - assert(accel_offset >= m->aux_offset); - assert(accel_offset < m->sherman_offset); - - const union AccelAux *aaux - = (const void *)((const char *)m + accel_offset); - const u8 *c2 = run_accel(aaux, c, c_end); - - if (c2 < min_accel_offset + BAD_ACCEL_DIST) { - min_accel_offset = c2 + BIG_ACCEL_PENALTY; - } else { - min_accel_offset = c2 + SMALL_ACCEL_PENALTY; - } - - if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { - min_accel_offset = c_end; - } - - DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", - c2 - c, min_accel_offset - c2, c_end - c2); - - c = c2; - s &= STATE_MASK; - goto without_accel; } - s &= STATE_MASK; - } + assert(c <= c_end); + } while (c < c_end); + +exit: + s 
&= STATE_MASK; if (mode == STOP_AT_MATCH) { *c_final = c_end; } *state = s; - return MO_CONTINUE_MATCHING; + return MO_ALIVE; } static never_inline -char mcclellanExec16_i_cb(const struct mcclellan *m, u16 *state, const u8 *buf, +char mcclellanExec16_i_cb(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point) { return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, @@ -244,7 +305,7 @@ char mcclellanExec16_i_cb(const struct mcclellan *m, u16 *state, const u8 *buf, } static never_inline -char mcclellanExec16_i_sam(const struct mcclellan *m, u16 *state, const u8 *buf, +char mcclellanExec16_i_sam(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point) { return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, @@ -252,15 +313,15 @@ char mcclellanExec16_i_sam(const struct mcclellan *m, u16 *state, const u8 *buf, } static never_inline -char mcclellanExec16_i_nm(const struct mcclellan *m, u16 *state, const u8 *buf, - size_t len, u64a offAdj, NfaCallback cb, void *ctxt, - char single, const u8 **final_point) { +char mcclellanExec16_i_nm(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, final_point, NO_MATCHES); } static really_inline -char mcclellanExec16_i_ni(const struct mcclellan *m, u16 *state, const u8 *buf, +char mcclellanExec16_i_ni(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point, enum MatchMode mode) { @@ -271,35 +332,69 @@ char mcclellanExec16_i_ni(const struct mcclellan *m, u16 *state, const u8 *buf, return mcclellanExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, single, final_point); } else { - assert (mode == NO_MATCHES); + assert(mode == NO_MATCHES); return mcclellanExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, final_point); } } static really_inline -char mcclellanExec8_i(const struct mcclellan *m, u8 *state, const u8 *buf, - size_t len, u64a offAdj, NfaCallback cb, void *ctxt, - char single, const u8 **c_final, enum MatchMode mode) { - u8 s = *state; - const u8 *c = buf, *c_end = buf + len; +u32 doNormal8(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; const u8 *succ_table = (const u8 *)((const char *)m + sizeof(struct mcclellan)); - const u32 as = m->alphaShift; - const struct mstate_aux *aux; + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? 
*c : '?', cprime); + s = succ_table[(s << as) + cprime]; - aux = (const struct mstate_aux *)((const char *)m + m->aux_offset + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset - sizeof(struct NFA)); - - u16 accel_limit = m->accel_limit_8; - u16 accept_limit = m->accept_limit_8; + u32 accept_limit = m->accept_limit_8; u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; - DEBUG_PRINTF("accel %hu, accept %hu\n", accel_limit, accept_limit); + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); - DEBUG_PRINTF("s: %hhu, len %zu\n", s, len); + DEBUG_PRINTF("s: %u, len %zu\n", s, len); const u8 *min_accel_offset = c; if (!m->has_accel || len < ACCEL_MIN_LEN) { @@ -310,124 +405,119 @@ char mcclellanExec8_i(const struct mcclellan *m, u8 *state, const u8 *buf, goto with_accel; without_accel: - while (c < min_accel_offset && s) { - u8 cprime = m->remap[*(c++)]; - DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), - ourisprint(*(c-1)) ? *(c-1) : '?', cprime); - s = succ_table[((u32)s << as) + cprime]; - DEBUG_PRINTF("s: %hhu\n", s); + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } + + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); if (mode != NO_MATCHES && s >= accept_limit) { if (mode == STOP_AT_MATCH) { DEBUG_PRINTF("match - pausing\n"); *state = s; *c_final = c - 1; - return MO_CONTINUE_MATCHING; + return MO_MATCHES_PENDING; } u64a loc = (c - 1) - buf + offAdj + 1; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + return MO_DEAD; } } else if (doComplexReport(cb, ctxt, m, s, loc, 0, - &cached_accept_state, - &cached_accept_id) + &cached_accept_state, &cached_accept_id) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + return MO_DEAD; } } + + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; } with_accel: - while (c < c_end && s) { - u8 cprime = m->remap[*(c++)]; - DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), - ourisprint(*(c-1)) ? 
*(c-1) : '?', cprime); - s = succ_table[((u32)s << as) + cprime]; - DEBUG_PRINTF("s: %hhu\n", s); + do { + u32 accel_limit = m->accel_limit_8; + assert(c < c_end); - if (s >= accel_limit) { /* accept_limit >= accel_limit */ - if (mode != NO_MATCHES && s >= accept_limit) { - if (mode == STOP_AT_MATCH) { - DEBUG_PRINTF("match - pausing\n"); - *state = s; - *c_final = c - 1; - return MO_CONTINUE_MATCHING; - } + if (!s) { + goto exit; + } - u64a loc = (c - 1) - buf + offAdj + 1; - if (single) { - DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } - } else if (doComplexReport(cb, ctxt, m, s, loc, 0, - &cached_accept_state, - &cached_accept_id) - == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } - } else if (aux[s].accel_offset) { - DEBUG_PRINTF("skipping\n"); - - const union AccelAux *aaux = (const void *)((const char *)m - + aux[s].accel_offset); - const u8 *c2 = run_accel(aaux, c, c_end); - - if (c2 < min_accel_offset + BAD_ACCEL_DIST) { - min_accel_offset = c2 + BIG_ACCEL_PENALTY; - } else { - min_accel_offset = c2 + SMALL_ACCEL_PENALTY; - } - - if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { - min_accel_offset = c_end; - } - - DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", - c2 - c, min_accel_offset - c2, c_end - c2); - - c = c2; + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { goto without_accel; } } - } + s = doNormal8(m, &c, c_end, s, 1, mode); + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: *state = s; if (mode == STOP_AT_MATCH) { *c_final = c_end; } - return MO_CONTINUE_MATCHING; + return MO_ALIVE; } static never_inline -char mcclellanExec8_i_cb(const struct mcclellan *m, u8 *state, const u8 *buf, +char mcclellanExec8_i_cb(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point) { return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, - final_point, CALLBACK_OUTPUT); + final_point, CALLBACK_OUTPUT); } static never_inline -char mcclellanExec8_i_sam(const struct mcclellan *m, u8 *state, const u8 *buf, +char mcclellanExec8_i_sam(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point) { return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, - final_point, STOP_AT_MATCH); + final_point, STOP_AT_MATCH); } static never_inline -char mcclellanExec8_i_nm(const struct mcclellan *m, u8 *state, const u8 *buf, +char mcclellanExec8_i_nm(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point) { return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, - final_point, NO_MATCHES); + final_point, NO_MATCHES); } static really_inline -char mcclellanExec8_i_ni(const struct 
mcclellan *m, u8 *state, const u8 *buf, +char mcclellanExec8_i_ni(const struct mcclellan *m, u32 *state, const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, void *ctxt, char single, const u8 **final_point, enum MatchMode mode) { @@ -445,7 +535,7 @@ char mcclellanExec8_i_ni(const struct mcclellan *m, u8 *state, const u8 *buf, } static really_inline -char mcclellanCheckEOD(const struct NFA *nfa, u16 s, u64a offset, +char mcclellanCheckEOD(const struct NFA *nfa, u32 s, u64a offset, NfaCallback cb, void *ctxt) { const struct mcclellan *m = getImplNfa(nfa); const struct mstate_aux *aux = get_aux(m, s); @@ -466,7 +556,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, s64a sp; assert(ISALIGNED_N(q->state, 2)); - u16 s = *(u16 *)q->state; + u32 s = *(u16 *)q->state; if (q->report_current) { assert(s); @@ -478,7 +568,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, &cached_accept_state, &cached_accept_id); @@ -487,7 +577,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, q->report_current = 0; if (rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + return MO_DEAD; } } @@ -496,12 +586,6 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, const u8 *cur_buf = sp < 0 ? hend : buffer; - char report = 1; - if (mode == CALLBACK_OUTPUT) { - /* we are starting inside the history buffer: matches are suppressed */ - report = !(sp < 0); - } - assert(q->cur); if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { DEBUG_PRINTF("this is as far as we go\n"); @@ -528,19 +612,20 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, /* do main buffer region */ const u8 *final_look; - if (mcclellanExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, - offset + sp, cb, context, single, &final_look, - report ? mode : NO_MATCHES) - == MO_HALT_MATCHING) { - assert(report); + char rv = mcclellanExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { *(u16 *)q->state = 0; - return 0; + return MO_DEAD; } - if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + assert(q->cur); - DEBUG_PRINTF("state %hu final_look %zd\n", s, - final_look - cur_buf); + assert(final_look != cur_buf + local_ep); + q->cur--; q->items[q->cur].type = MQE_START; q->items[q->cur].location = final_look - cur_buf + 1; /* due to @@ -549,6 +634,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_MATCHES_PENDING; } + assert(rv == MO_ALIVE); assert(q->cur); if (mode != NO_MATCHES && q->items[q->cur].location > end) { DEBUG_PRINTF("this is as far as we go\n"); @@ -563,7 +649,6 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, if (sp == 0) { cur_buf = buffer; - report = 1; } if (sp != ep) { @@ -582,7 +667,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, case MQE_END: *(u16 *)q->state = s; q->cur++; - return s ? MO_ALIVE : 0; + return s ? 
MO_ALIVE : MO_DEAD; default: assert(!"invalid queue event"); } @@ -591,18 +676,18 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, } } -static really_inline really_flatten -char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, - const u8 *buffer, size_t length, - NfaCallback cb, void *context, char single) { +static really_inline +char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context, + char single) { assert(n->type == MCCLELLAN_NFA_16); const struct mcclellan *m = getImplNfa(n); - u16 s = m->start_anchored; + u32 s = m->start_anchored; if (mcclellanExec16_i(m, &s, buffer, length, offset, cb, context, single, NULL, CALLBACK_OUTPUT) - == MO_HALT_MATCHING) { - return 0; + == MO_DEAD) { + return s ? MO_ALIVE : MO_DEAD; } const struct mstate_aux *aux = get_aux(m, s); @@ -611,19 +696,19 @@ char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); } - return !!s; + return MO_ALIVE; } static really_inline char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, - const u8 *hend, NfaCallback cb, void *context, - struct mq *q, char single, s64a end, - enum MatchMode mode) { + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { assert(n->type == MCCLELLAN_NFA_8); const struct mcclellan *m = getImplNfa(n); s64a sp; - u8 s = *(u8 *)q->state; + u32 s = *(u8 *)q->state; if (q->report_current) { assert(s); @@ -635,7 +720,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, &cached_accept_state, &cached_accept_id); @@ -644,7 +729,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, q->report_current = 0; if (rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; + return MO_DEAD; } } @@ -653,12 +738,6 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, const u8 *cur_buf = sp < 0 ? hend : buffer; - char report = 1; - if (mode == CALLBACK_OUTPUT) { - /* we are starting inside the history buffer: matches are suppressed */ - report = !(sp < 0); - } - if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { DEBUG_PRINTF("this is as far as we go\n"); q->cur--; @@ -686,17 +765,20 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, } const u8 *final_look; - if (mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, offset + sp, - cb, context, single, &final_look, - report ? 
mode : NO_MATCHES) - == MO_HALT_MATCHING) { + char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { *(u8 *)q->state = 0; - return 0; + return MO_DEAD; } - if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { - /* found a match */ - DEBUG_PRINTF("found a match\n"); + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + assert(q->cur); + assert(final_look != cur_buf + local_ep); + q->cur--; q->items[q->cur].type = MQE_START; q->items[q->cur].location = final_look - cur_buf + 1; /* due to @@ -705,6 +787,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_MATCHES_PENDING; } + assert(rv == MO_ALIVE); assert(q->cur); if (mode != NO_MATCHES && q->items[q->cur].location > end) { DEBUG_PRINTF("this is as far as we go\n"); @@ -720,7 +803,6 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, if (sp == 0) { cur_buf = buffer; - report = 1; } if (sp != ep) { @@ -739,7 +821,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, case MQE_END: *(u8 *)q->state = s; q->cur++; - return s ? MO_ALIVE : 0; + return s ? MO_ALIVE : MO_DEAD; default: assert(!"invalid queue event"); } @@ -748,18 +830,18 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, } } -static really_inline really_flatten +static really_inline char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context, char single) { assert(n->type == MCCLELLAN_NFA_8); const struct mcclellan *m = getImplNfa(n); - u8 s = (u8)m->start_anchored; + u32 s = m->start_anchored; if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single, NULL, CALLBACK_OUTPUT) - == MO_HALT_MATCHING) { - return 0; + == MO_DEAD) { + return MO_DEAD; } const struct mstate_aux *aux = get_aux(m, s); @@ -768,7 +850,7 @@ char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer, doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); } - return s; + return s ? 
MO_ALIVE : MO_DEAD; } char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer, @@ -827,7 +909,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = getImplNfa(n); NfaCallback cb = q->cb; void *ctxt = q->context; - u8 s = *(u8 *)q->state; + u32 s = *(u8 *)q->state; u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; u64a offset = q_cur_offset(q); assert(q_cur_type(q) == MQE_START); @@ -839,7 +921,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { cb(0, offset, m->arb_report, ctxt); } else { u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, &cached_accept_id); @@ -853,12 +935,12 @@ char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = getImplNfa(n); NfaCallback cb = q->cb; void *ctxt = q->context; - u16 s = *(u16 *)q->state; + u32 s = *(u16 *)q->state; const struct mstate_aux *aux = get_aux(m, s); u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; u64a offset = q_cur_offset(q); assert(q_cur_type(q) == MQE_START); - DEBUG_PRINTF("state %hu\n", s); + DEBUG_PRINTF("state %u\n", s); assert(s); if (aux->accept) { @@ -867,7 +949,7 @@ char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { cb(0, offset, m->arb_report, ctxt); } else { u32 cached_accept_id = 0; - u16 cached_accept_state = 0; + u32 cached_accept_state = 0; doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, &cached_accept_id); @@ -1041,7 +1123,7 @@ void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, size_t len, NfaCallback cb, void *ctxt) { const struct mcclellan *m = getImplNfa(nfa); - u8 s = top ? m->start_anchored : *(u8 *)state; + u32 s = top ? m->start_anchored : *(u8 *)state; if (m->flags & MCCLELLAN_FLAG_SINGLE) { mcclellanExec8_i(m, &s, buf + start_off, len - start_off, @@ -1059,14 +1141,14 @@ void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, size_t len, NfaCallback cb, void *ctxt) { const struct mcclellan *m = getImplNfa(nfa); - u16 s = top ? m->start_anchored : unaligned_load_u16(state); + u32 s = top ? 
m->start_anchored : unaligned_load_u16(state); if (m->flags & MCCLELLAN_FLAG_SINGLE) { mcclellanExec16_i(m, &s, buf + start_off, len - start_off, - start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); + start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); } else { mcclellanExec16_i(m, &s, buf + start_off, len - start_off, - start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); + start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); } unaligned_store_u16(state, s); @@ -1087,13 +1169,15 @@ char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, context); } -char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { +char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { assert(nfa->scratchStateSize == 1); *(u8 *)q->state = 0; return 0; } -char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { +char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { assert(nfa->scratchStateSize == 2); assert(ISALIGNED_N(q->state, 2)); *(u16 *)q->state = 0; diff --git a/src/nfa/mcclellan_common_impl.h b/src/nfa/mcclellan_common_impl.h index e3bcf43e..be130715 100644 --- a/src/nfa/mcclellan_common_impl.h +++ b/src/nfa/mcclellan_common_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,14 +26,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#if defined(__INTEL_COMPILER) || defined(__clang__) || defined(_WIN32) || defined(__GNUC__) && (__GNUC__ < 4) -#define really_flatten -#else -#define really_flatten __attribute__ ((flatten)) -#endif - -#define CASE_MASK 0xdf - enum MatchMode { CALLBACK_OUTPUT, STOP_AT_MATCH, @@ -41,7 +33,7 @@ enum MatchMode { }; static really_inline -const struct mstate_aux *get_aux(const struct mcclellan *m, u16 s) { +const struct mstate_aux *get_aux(const struct mcclellan *m, u32 s) { const char *nfa = (const char *)m - sizeof(struct NFA); const struct mstate_aux *aux = s + (const struct mstate_aux *)(nfa + m->aux_offset); @@ -51,15 +43,15 @@ const struct mstate_aux *get_aux(const struct mcclellan *m, u16 s) { } static really_inline -u16 mcclellanEnableStarts(const struct mcclellan *m, u16 s) { +u32 mcclellanEnableStarts(const struct mcclellan *m, u32 s) { const struct mstate_aux *aux = get_aux(m, s); - DEBUG_PRINTF("enabling starts %hu->%hu\n", s, aux->top); + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); return aux->top; } static really_inline -u16 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, u32 as) { assert(ISALIGNED_N(sherman_state, 16)); @@ -78,15 +70,15 @@ u16 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (z) { u32 i = ctz32(z & ~0xf) - 4; - u16 s_out = unaligned_load_u16((const u8 *)sherman_state + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + SHERMAN_STATES_OFFSET(len) + sizeof(u16) * i); - DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu " - "s=%hu\n", i, len, cprime, s_out); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); return s_out; } } - u16 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); - return succ_table[((u32)daddy << as) + cprime]; + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + 
cprime]; } diff --git a/src/nfa/mcclellan_internal.h b/src/nfa/mcclellan_internal.h index aad296c4..549bccf5 100644 --- a/src/nfa/mcclellan_internal.h +++ b/src/nfa/mcclellan_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,17 +71,17 @@ struct mcclellan { u16 start_floating; /**< floating start state */ u32 aux_offset; /**< offset of the aux structures relative to the start of * the nfa structure */ - u32 sherman_offset; /**< offset of to array of sherman state offsets - * the state_info structures relative to the start of the - * nfa structure */ - u32 sherman_end; /**< offset of the end of the state_info structures relative - * to the start of the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ u16 accept_limit_8; /**< 8 bit, lowest accept state */ u16 sherman_limit; /**< lowest sherman state */ u8 alphaShift; u8 flags; - u8 has_accel; /**< 1 iff there are any accel planes */ + u8 has_accel; /**< 1 iff there are any accel plans */ u8 remap[256]; /**< remaps characters to a smaller alphabet */ ReportID arb_report; /**< one of the accepts that this dfa may raise */ u32 accel_offset; /**< offset of the accel structures from start of NFA */ @@ -90,8 +90,8 @@ struct mcclellan { static really_inline const char *findShermanState(UNUSED const struct mcclellan *m, - const char *sherman_base_offset, u16 sherman_base, - u16 s) { + const char *sherman_base_offset, u32 sherman_base, + u32 s) { const char *rv = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); assert(rv < (const char *)m + m->length - sizeof(struct NFA)); @@ -102,7 +102,7 @@ const char *findShermanState(UNUSED const struct mcclellan *m, static really_inline char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base, - u16 s) { + u32 s) { return sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); } diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index 09006d5b..7a73c9d4 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -415,9 +415,9 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, : info.raw.start_floating); } -/* returns non-zero on error */ +/* returns false on error */ static -int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { +bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { info.states[0].impl_id = 0; /* dead is always 0 */ vector norm; @@ -426,7 +426,7 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { if (info.size() > (1 << 16)) { DEBUG_PRINTF("too many states\n"); *sherman_base = 0; - return 1; + return false; } for (u32 i = 1; i < info.size(); i++) { @@ -452,7 +452,7 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { /* Check to see if we haven't over allocated our states */ DEBUG_PRINTF("next sherman %u masked %u\n", next_sherman, (dstate_id_t)(next_sherman & STATE_MASK)); - return (next_sherman - 1) != ((next_sherman - 1) & STATE_MASK); + return (next_sherman - 1) == ((next_sherman - 1) & STATE_MASK); } static @@ -470,7 +470,7 @@ 
aligned_unique_ptr mcclellanCompile16(dfa_info &info, assert(alphaShift <= 8); u16 count_real_states; - if (allocateFSN16(info, &count_real_states)) { + if (!allocateFSN16(info, &count_real_states)) { DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", info.size()); return nullptr; diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index e6f548a7..8d8dfb19 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -32,9 +32,7 @@ #include "accel_dfa_build_strat.h" #include "rdfa.h" #include "ue2common.h" -#include "util/accel_scheme.h" #include "util/alloc.h" -#include "util/charreach.h" #include "util/ue2_containers.h" #include diff --git a/src/nfa/mcclellandump.cpp b/src/nfa/mcclellandump.cpp index dcbb0915..9e04ad63 100644 --- a/src/nfa/mcclellandump.cpp +++ b/src/nfa/mcclellandump.cpp @@ -39,6 +39,7 @@ #include "ue2common.h" #include "util/charreach.h" #include "util/dump_charclass.h" +#include "util/dump_util.h" #include "util/unaligned.h" #include @@ -267,8 +268,8 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } -void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f, - UNUSED const string &base) { +static +void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) { assert(nfa->type == MCCLELLAN_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -287,8 +288,8 @@ void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f, fprintf(f, "}\n"); } -void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f, - UNUSED const string &base) { +static +void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f) { assert(nfa->type == MCCLELLAN_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -397,6 +398,7 @@ void dumpTransitions(FILE *f, const NFA *nfa, const mcclellan *m, } } +static void nfaExecMcClellan16_dumpText(const NFA *nfa, FILE *f) { assert(nfa->type == MCCLELLAN_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -417,6 +419,7 @@ void nfaExecMcClellan16_dumpText(const NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +static void nfaExecMcClellan8_dumpText(const NFA *nfa, FILE *f) { assert(nfa->type == MCCLELLAN_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -437,4 +440,24 @@ void nfaExecMcClellan8_dumpText(const NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +void nfaExecMcClellan16_dump(const NFA *nfa, const string &base) { + assert(nfa->type == MCCLELLAN_NFA_16); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + nfaExecMcClellan16_dumpText(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + nfaExecMcClellan16_dumpDot(nfa, f); + fclose(f); +} + +void nfaExecMcClellan8_dump(const NFA *nfa, const string &base) { + assert(nfa->type == MCCLELLAN_NFA_8); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + nfaExecMcClellan8_dumpText(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + nfaExecMcClellan8_dumpDot(nfa, f); + fclose(f); +} + } // namespace ue2 diff --git a/src/nfa/mcclellandump.h b/src/nfa/mcclellandump.h index efa61544..5b63a206 100644 --- a/src/nfa/mcclellandump.h +++ b/src/nfa/mcclellandump.h @@ -43,14 +43,10 @@ union AccelAux; namespace ue2 { -void nfaExecMcClellan8_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecMcClellan16_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecMcClellan8_dumpText(const struct NFA *nfa, FILE *file); -void nfaExecMcClellan16_dumpText(const struct NFA *nfa, FILE *file); +void 
nfaExecMcClellan8_dump(const struct NFA *nfa, const std::string &base); +void nfaExecMcClellan16_dump(const struct NFA *nfa, const std::string &base); -/* These functions are shared with the Haig dump code. */ +/* These functions are shared with the Gough dump code. */ const mstate_aux *getAux(const NFA *n, dstate_id_t i); void describeEdge(FILE *f, const u16 *t, u16 i); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c new file mode 100644 index 00000000..98db3f0a --- /dev/null +++ b/src/nfa/mcsheng.c @@ -0,0 +1,1406 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mcsheng.h" + +#include "accel.h" +#include "mcsheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "ue2common.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct mstate_aux *get_aux(const struct mcsheng *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts(const struct mcsheng *m, u32 s) { + const struct mstate_aux *aux = get_aux(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, + u32 as) { + assert(ISALIGNED_N(sherman_state, 16)); + + u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); + + if (len) { + m128 ss_char = load128(sherman_state); + m128 cur_char = set16x8(cprime); + + u32 z = movemask128(eq128(ss_char, cur_char)); + + /* remove header cruft: type 1, len 1, daddy 2*/ + z &= ~0xf; + z &= (1U << (len + 4)) - 1; + + if (z) { + u32 i = ctz32(z & ~0xf) - 4; + + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + + SHERMAN_STATES_OFFSET(len) + + sizeof(u16) * i); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); + return s_out; + } + } + + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + cprime]; +} + +static really_inline +char doComplexReport(NfaCallback cb, void *ctxt, const struct mcsheng *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#define SHENG_CHUNK 8 + +static really_inline +u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m128 s = set16x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + const m128 *masks = m->sheng_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
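 * [Editor's note] Worked through: with the state byte s duplicated in the
 * four low lanes, movd() returns s * 0x01010101. Both s and the limit fit
 * in a byte, so s * 0x01010101 >= limit * 0x01010101 holds exactly when
 * s >= limit (no u32 overflow: 0xff * 0x01010101 == 0xffffffff), making the
 * x4-scaled comparison below equivalent to the plain one.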
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); + m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hu, accel %hhu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG_SINGLE_ITER do { \ + m128 shuffle_mask = masks[*(c++)]; \ + s = pshufb(shuffle_mask, s); \ + u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) + /* This version uses pext for efficently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + m128 shuffle_mask0 = load128((const char *)masks + cc0); + s = pshufb(shuffle_mask0, s); + m128 s_max = s; + m128 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %hhu\n", cc0 >> 4, movd(s)); + +#define SHENG_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 4); \ + m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \ + s = pshufb(shuffle_mask##iter, s); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m128 s_temp = sadd_u8_m128(s, accel_delta); \ + s_max = max_u8_m128(s_max, s_temp); \ + } else { \ + s_max = max_u8_m128(s_max, s); \ + } \ + m128 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %hhu max %hhu\n", cc##iter >> 4, \ + movd(s), movd(s_max)); + + SHENG_SINGLE_UNROLL_ITER(1); + + SHENG_SINGLE_UNROLL_ITER(2); + SHENG_SINGLE_UNROLL_ITER(3); + + SHENG_SINGLE_UNROLL_ITER(4); + SHENG_SINGLE_UNROLL_ITER(5); + + SHENG_SINGLE_UNROLL_ITER(6); + SHENG_SINGLE_UNROLL_ITER(7); + + if (movd(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
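 * [Editor's note] s_max6 is the running maximum over the first seven bytes
 * of the chunk; if it is still below the limit, the stop condition can only
 * have been triggered by the eighth byte, so the current state is already
 * the stop state and the whole chunk may be consumed at once.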
*/ + if (movd(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m128 blended = rshift64_m128(s_max0, 56); + blended = xor128(blended, rshift64_m128(s_max1, 48)); + blended = xor128(blended, rshift64_m128(s_max2, 40)); + blended = xor128(blended, rshift64_m128(s_max3, 32)); + blended = xor128(blended, rshift64_m128(s_max4, 24)); + blended = xor128(blended, rshift64_m128(s_max5, 16)); + blended = xor128(blended, rshift64_m128(s_max6, 8)); + blended = xor128(blended, s); + blended = xor128(blended, rshift64_m128(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq(blended)); + + m128 final = min_u8_m128(blended, simd_stop_limit); + m128 cmp = sub_u8_m128(final, simd_stop_limit); + u64a stops = ~movemask128(cmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG_SINGLE_ITER; + case 6: + SHENG_SINGLE_ITER; + case 5: + SHENG_SINGLE_ITER; + case 4: + SHENG_SINGLE_ITER; + case 3: + SHENG_SINGLE_ITER; + case 2: + SHENG_SINGLE_ITER; + case 1: + SHENG_SINGLE_ITER; + } + } + + assert(c >= soft_c_end); + + s_gpr = movd(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState(UNUSED const struct mcsheng *m, + const char *sherman_base_offset, u32 sherman_base, + u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel(const struct mcsheng *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcsheng *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c 
= *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + s &= STATE_MASK; + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec16_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + 
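            /* [Editor's note] run_mcsheng_accel() has already skipped ahead
             * and pushed min_accel_offset forward (further after an
             * unproductive skip). If the skip consumed the rest of the
             * buffer we are done; otherwise we fall back to the
             * without_accel loop and do not retry acceleration until
             * min_accel_offset is reached again, so repeated poor skips
             * cannot dominate the scan. */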
goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcshengExec16_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec16_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec16_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec16_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcshengExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal8(const struct mcsheng *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. 
Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec8_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: 
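+    /* Out of input or asked to stop: hand the current state back to the
+     * caller. The 8-bit engine orders its state ids (sheng states, then
+     * normal, then accelerable, then accepting - see allocateImplId8), so
+     * the "s >= accept_limit" tests above are simple range checks. */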
+ *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcshengExec8_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec8_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec8_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec8_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcshengExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcshengExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcshengExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + 
assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + if (rv && nfaExecMcSheng8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = 
q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + + if (rv && nfaExecMcSheng16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} diff --git a/src/nfa/mcsheng.h b/src/nfa/mcsheng.h new file mode 100644 index 00000000..19fd6961 --- /dev/null +++ b/src/nfa/mcsheng.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list 
of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCSHENG_H +#define MCSHENG_H + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +/* 8-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 16-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + 
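+/* McSheng implements no reverse block scan and no zombie support; the
+ * entries below stub those slots out in the NFA API dispatch table. */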
+#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#endif diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp new file mode 100644 index 00000000..7b4e58ab --- /dev/null +++ b/src/nfa/mcsheng_compile.cpp @@ -0,0 +1,1070 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "mcsheng_compile.h"
+
+#include "accel.h"
+#include "accelcompile.h"
+#include "grey.h"
+#include "mcclellancompile.h"
+#include "mcclellancompile_util.h"
+#include "mcsheng_internal.h"
+#include "nfa_internal.h"
+#include "rdfa_graph.h"
+#include "shufticompile.h"
+#include "trufflecompile.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/compile_context.h"
+#include "util/container.h"
+#include "util/graph.h"
+#include "util/graph_range.h"
+#include "util/make_unique.h"
+#include "util/order_check.h"
+#include "util/report_manager.h"
+#include "util/ue2_containers.h"
+#include "util/unaligned.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <deque>
+#include <vector>
+
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using boost::adaptors::map_keys;
+
+namespace ue2 {
+
+namespace /* anon */ {
+
+#define MIN_SHENG_SIZE 6
+#define INVALID_SHENG_ID 255
+
+struct dstate_extra {
+    u16 daddytaken = 0;
+    bool shermanState = false;
+    bool sheng_succ = false;
+    u8 sheng_id = INVALID_SHENG_ID;
+};
+
+struct dfa_info {
+    accel_dfa_build_strat &strat;
+    raw_dfa &raw;
+    vector<dstate> &states;
+    vector<dstate_extra> extra;
+    const u16 alpha_size; /* including special symbols */
+    const array<u16, ALPHABET_SIZE> &alpha_remap;
+    vector<CharReach> rev_alpha;
+    const u16 impl_alpha_size;
+
+    u8 getAlphaShift() const;
+
+    explicit dfa_info(accel_dfa_build_strat &s)
+        : strat(s),
+          raw(s.get_raw()),
+          states(raw.states),
+          extra(raw.states.size()),
+          alpha_size(raw.alpha_size),
+          alpha_remap(raw.alpha_remap),
+          impl_alpha_size(raw.getImplAlphaSize()) {
+        rev_alpha.resize(impl_alpha_size);
+        for (u32 i = 0; i < N_CHARS; i++) {
+            rev_alpha[alpha_remap[i]].set(i);
+        }
+    }
+
+    dstate_id_t implId(dstate_id_t raw_id) const {
+        return states[raw_id].impl_id;
+    }
+
+    bool is_sherman(dstate_id_t raw_id) const {
+        return extra[raw_id].shermanState;
+    }
+
+    bool is_sheng(dstate_id_t raw_id) const {
+        return extra[raw_id].sheng_id != INVALID_SHENG_ID;
+    }
+
+    bool is_sheng_succ(dstate_id_t raw_id) const {
+        return extra[raw_id].sheng_succ;
+    }
+
+    /* states which use the normal transition/successor table */
+    bool is_normal(dstate_id_t raw_id) const {
+        return raw_id != DEAD_STATE && !is_sheng(raw_id) && !is_sherman(raw_id);
+    }
+    size_t size(void) const { return states.size(); }
+};
+
+u8 dfa_info::getAlphaShift() const {
+    if (impl_alpha_size < 2) {
+        return 1;
+    } else {
+        /* log2 round up */
+        return 32 - clz32(impl_alpha_size - 1);
+    }
+}
+
+} // namespace
+
+static
+mstate_aux *getAux(NFA *n, dstate_id_t i) {
+    mcsheng *m = (mcsheng *)getMutableImplNfa(n);
+    mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset);
+
+    mstate_aux *aux = aux_base + i;
+    assert((const char *)aux < (const char *)n + m->length);
+    return aux;
+}
+
+static
+void createShuffleMasks(mcsheng *m, const dfa_info &info,
+                        dstate_id_t sheng_end,
+                        const map<dstate_id_t, AccelScheme> &accel_escape_info) {
+    DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end);
+    assert(sheng_end > DEAD_STATE + 1);
+    assert(sheng_end <= sizeof(m128) + 1);
+    vector<array<u8, sizeof(m128)>> masks;
+    masks.resize(info.alpha_size);
+    /* -1 to avoid wasting a slot as we do not include dead state */
+    vector<dstate_id_t> raw_ids;
+    raw_ids.resize(sheng_end - 1);
+    for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) {
+        assert(info.implId(s)); /* should not map to DEAD_STATE */
+        if (info.is_sheng(s)) {
+            raw_ids[info.extra[s].sheng_id] = s;
+        }
+    }
+    for (u32 i = 0; i < info.alpha_size; i++)
{ + if (i == info.alpha_remap[TOP]) { + continue; + } + auto &mask = masks[i]; + assert(sizeof(mask) == sizeof(m128)); + mask.fill(0); + + for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) { + dstate_id_t raw_id = raw_ids[sheng_id]; + dstate_id_t next_id = info.implId(info.states[raw_id].next[i]); + if (next_id == DEAD_STATE) { + next_id = sheng_end - 1; + } else if (next_id < sheng_end) { + next_id--; + } + DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id); + mask[sheng_id] = verify_u8(next_id); + } + } + for (u32 i = 0; i < N_CHARS; i++) { + assert(info.alpha_remap[i] != info.alpha_remap[TOP]); + memcpy((u8 *)&m->sheng_masks[i], + (u8 *)masks[info.alpha_remap[i]].data(), sizeof(m128)); + } + m->sheng_end = sheng_end; + m->sheng_accel_limit = sheng_end - 1; + + for (dstate_id_t s : raw_ids) { + if (contains(accel_escape_info, s)) { + LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id); + } + } +} + +static +void populateBasicInfo(size_t state_size, const dfa_info &info, + u32 total_size, u32 aux_offset, u32 accel_offset, + u32 accel_count, ReportID arb, bool single, NFA *nfa) { + assert(state_size == sizeof(u16) || state_size == sizeof(u8)); + + nfa->length = total_size; + nfa->nPositions = info.states.size(); + + nfa->scratchStateSize = verify_u32(state_size); + nfa->streamStateSize = verify_u32(state_size); + + if (state_size == sizeof(u8)) { + nfa->type = MCSHENG_NFA_8; + } else { + nfa->type = MCSHENG_NFA_16; + } + + mcsheng *m = (mcsheng *)getMutableImplNfa(nfa); + for (u32 i = 0; i < 256; i++) { + m->remap[i] = verify_u8(info.alpha_remap[i]); + } + m->alphaShift = info.getAlphaShift(); + m->length = total_size; + m->aux_offset = aux_offset; + m->accel_offset = accel_offset; + m->arb_report = arb; + m->state_count = verify_u16(info.size()); + m->start_anchored = info.implId(info.raw.start_anchored); + m->start_floating = info.implId(info.raw.start_floating); + m->has_accel = accel_count ? 1 : 0; + + if (single) { + m->flags |= MCSHENG_FLAG_SINGLE; + } +} + +static +size_t calcShermanRegionSize(const dfa_info &info) { + size_t rv = 0; + + for (size_t i = 0; i < info.size(); i++) { + if (info.is_sherman(i)) { + rv += SHERMAN_FIXED_SIZE; + } + } + + return ROUNDUP_16(rv); +} + +static +void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, + const vector &reports, const vector &reports_eod, + const vector &reportOffsets) { + const dstate &raw_state = info.states[i]; + aux->accept = raw_state.reports.empty() ? 0 : reportOffsets[reports[i]]; + aux->accept_eod = raw_state.reports_eod.empty() ? 0 + : reportOffsets[reports_eod[i]]; + aux->top = info.implId(i ? 
raw_state.next[info.alpha_remap[TOP]] + : info.raw.start_floating); +} + +/* returns false on error */ +static +bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end, + dstate_id_t *sherman_base) { + info.states[0].impl_id = 0; /* dead is always 0 */ + + vector norm; + vector sherm; + vector norm_sheng_succ; + vector sherm_sheng_succ; + + if (info.size() > (1 << 16)) { + DEBUG_PRINTF("too many states\n"); + *sherman_base = 0; + return false; + } + + for (u32 i = 1; i < info.size(); i++) { + if (info.is_sheng(i)) { + continue; /* sheng impl ids have already been allocated */ + } if (info.is_sherman(i)) { + if (info.is_sheng_succ(i)) { + sherm_sheng_succ.push_back(i); + } else { + sherm.push_back(i); + } + } else { + if (info.is_sheng_succ(i)) { + norm_sheng_succ.push_back(i); + } else { + norm.push_back(i); + } + } + } + + dstate_id_t next_norm = sheng_end; + for (dstate_id_t s : norm_sheng_succ) { + info.states[s].impl_id = next_norm++; + } + if (next_norm + norm.size() + sherm_sheng_succ.size() > UINT8_MAX) { + /* we need to give sheng_succs ids which fit into a u8 -- demote these + * to normal states */ + for (dstate_id_t s : sherm_sheng_succ) { + info.states[s].impl_id = next_norm++; + info.extra[s].shermanState = false; + } + sherm_sheng_succ.clear(); + } + for (dstate_id_t s : norm) { + info.states[s].impl_id = next_norm++; + } + + *sherman_base = next_norm; + dstate_id_t next_sherman = next_norm; + + for (dstate_id_t s : sherm_sheng_succ) { + info.states[s].impl_id = next_sherman++; + } + + for (dstate_id_t s : sherm) { + info.states[s].impl_id = next_sherman++; + } + + /* Check to see if we haven't over allocated our states */ + DEBUG_PRINTF("next sherman %u masked %u\n", next_sherman, + (dstate_id_t)(next_sherman & STATE_MASK)); + return (next_sherman - 1) == ((next_sherman - 1) & STATE_MASK); +} + +typedef RdfaGraph::vertex_descriptor RdfaVertex; + +static +bool mark_sheng_succs(const RdfaGraph &g, dfa_info &info, + const flat_set &sheng_states) { + u32 exit_count = 0; + + for (auto v : sheng_states) { + dstate_id_t s = g[v].index; + for (u32 i = 0; i != info.alpha_size; i++) { + if (i == info.alpha_remap[TOP]) { + continue; + } + dstate_id_t next = info.states[s].next[i]; + if (!next || info.is_sheng(next) || info.is_sheng_succ(next)) { + continue; + } + exit_count++; + info.extra[next].sheng_succ = true; + } + } + + if (exit_count + sheng_states.size() < UINT8_MAX) { + return true; + } else { + DEBUG_PRINTF("fail: unable to fit %u exits in byte", exit_count); + return false; + } +} + +static +CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { + CharReach rv; + for (u32 i = 0; i < info.impl_alpha_size; i++) { + if (info.raw.states[u].next[i] == v) { + assert(info.rev_alpha[i].any()); + rv |= info.rev_alpha[i]; + } + } + assert(rv.any()); + return rv; +} + +#define MAX_SHENG_STATES 16 +#define MAX_SHENG_LEAKINESS 0.05 + +/** + * Returns the proportion of strings of length 'depth' which will leave the + * sheng region when starting at state 'u'. 
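+ *
+ * For example (hypothetical numbers): if half of the byte values leave the
+ * region immediately from 'u', they contribute 0.5 to the result; an
+ * in-region successor reached on a quarter of the byte values contributes
+ * 0.25 multiplied by that successor's own leakiness at depth - 1.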
+ */ +static +double leakiness(const RdfaGraph &g, dfa_info &info, + const flat_set &sheng_states, RdfaVertex u, + u32 depth, + unordered_map, double> &cache) { + double rv = 0; + if (contains(cache, make_pair(u, depth))) { + return cache[make_pair(u, depth)]; + } + for (RdfaVertex v : adjacent_vertices_range(u, g)) { + if (g[v].index == DEAD_STATE) { + continue; + } + double width = get_edge_reach(g[u].index, g[v].index, info).count(); + width /= N_CHARS; + + double weight; + if (!contains(sheng_states, v)) { + weight = 1; + } else if (depth > 1) { + weight = leakiness(g, info, sheng_states, v, depth - 1, cache); + } else { + continue; /* weight = 0 */ + } + rv += width * weight; + } + + cache[make_pair(u, depth)] = rv; + DEBUG_PRINTF("%zu [%u] q = %g\n", g[u].index, depth, rv); + return rv; +} + +/** + * Returns the proportion of 8 byte strings which will leave the sheng region + * when starting at state 'u'. + */ +static +double leakiness(const RdfaGraph &g, dfa_info &info, + const flat_set &sheng_states, RdfaVertex u) { + unordered_map, double> cache; + double rv = leakiness(g, info, sheng_states, u, 8, cache); + return rv; +} + +static +dstate_id_t find_sheng_states(dfa_info &info, + map &accel_escape_info) { + RdfaGraph g(info.raw); + auto cyclics = find_vertices_in_cycles(g); + + auto base_cyclic = RdfaGraph::null_vertex(); + for (const auto &v : cyclics) { + if (g[v].index == DEAD_STATE) { + continue; + } + DEBUG_PRINTF("considering cyclic %zu\n", g[v].index); + /* get an estimate of stickness of the cyclic: assume any edges from + * states with larger state ids are back edges */ + CharReach est_back_reach; + for (const auto &u : inv_adjacent_vertices_range(v, g)) { + if (g[u].index < g[v].index) { + continue; + } + est_back_reach |= get_edge_reach(g[u].index, g[v].index, info); + } + + if (est_back_reach.count() < 30) { + continue; + } + base_cyclic = v; + break; + } + if (!base_cyclic) { + return DEAD_STATE; + } + + flat_set sheng_states; + deque to_consider = { base_cyclic }; + flat_set considered = { DEAD_STATE }; + bool seen_back_edge = false; + while (!to_consider.empty() + && sheng_states.size() < MAX_SHENG_STATES) { + auto v = to_consider.front(); + to_consider.pop_front(); + if (!considered.insert(g[v].index).second) { + continue; + } + + assert(!contains(sheng_states, v)); + + if (generates_callbacks(info.raw.kind) + && !info.states[g[v].index].reports.empty()) { + /* cannot raise callbacks from sheng region */ + continue; + } + + sheng_states.insert(v); + for (const auto &t : adjacent_vertices_range(v, g)) { + if (!contains(considered, g[t].index)) { + to_consider.push_back(t); + } + if (t == base_cyclic) { + seen_back_edge = true; + } + } + } + + /* allocate normal ids */ + dstate_id_t sheng_end = DEAD_STATE + 1; + for (auto v : sheng_states) { + dstate_id_t s = g[v].index; + if (!contains(accel_escape_info, s)) { + info.states[s].impl_id = sheng_end++; + info.extra[s].sheng_id = info.states[s].impl_id - 1; + } + } + + /* allocate accel ids */ + for (auto v : sheng_states) { + dstate_id_t s = g[v].index; + if (contains(accel_escape_info, s)) { + assert(!info.states[s].impl_id); + info.states[s].impl_id = sheng_end++; + info.extra[s].sheng_id = info.states[s].impl_id - 1; + } + } + + if (sheng_states.size() < MIN_SHENG_SIZE) { + DEBUG_PRINTF("sheng region too small\n"); + return DEAD_STATE; + } + + if (!seen_back_edge) { + DEBUG_PRINTF("did not include cyclic\n"); + return DEAD_STATE; + } + + double leak = leakiness(g, info, sheng_states, base_cyclic); + if (leak > 
MAX_SHENG_LEAKINESS) {
+        DEBUG_PRINTF("too leaky (%g)\n", leak);
+        return DEAD_STATE;
+    }
+
+    if (!mark_sheng_succs(g, info, sheng_states)) {
+        return DEAD_STATE;
+    }
+
+    /* TODO: ensure sufficiently 'sticky' */
+    /* TODO: check not all states accel */
+    DEBUG_PRINTF("sheng_end = %hu\n", sheng_end);
+    return sheng_end;
+}
+
+static
+void fill_in_aux_info(NFA *nfa, const dfa_info &info,
+                      const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                      u32 accel_offset, UNUSED u32 accel_end_offset,
+                      const vector<u32> &reports,
+                      const vector<u32> &reports_eod,
+                      u32 report_base_offset,
+                      const raw_report_info &ri) {
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa);
+
+    vector<u32> reportOffsets;
+
+    ri.fillReportLists(nfa, report_base_offset, reportOffsets);
+
+    for (u32 i = 0; i < info.size(); i++) {
+        u16 impl_id = info.implId(i);
+        mstate_aux *this_aux = getAux(nfa, impl_id);
+
+        fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
+        if (contains(accel_escape_info, i)) {
+            this_aux->accel_offset = accel_offset;
+            accel_offset += info.strat.accelSize();
+            assert(accel_offset <= accel_end_offset);
+            assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+            info.strat.buildAccel(i, accel_escape_info.at(i),
+                                  (void *)((char *)m + this_aux->accel_offset));
+        }
+    }
+}
+
+static
+u16 get_edge_flags(NFA *nfa, dstate_id_t target_impl_id) {
+    mstate_aux *aux = getAux(nfa, target_impl_id);
+    u16 flags = 0;
+
+    if (aux->accept) {
+        flags |= ACCEPT_FLAG;
+    }
+
+    if (aux->accel_offset) {
+        flags |= ACCEL_FLAG;
+    }
+
+    return flags;
+}
+
+static
+void fill_in_succ_table_16(NFA *nfa, const dfa_info &info,
+                           dstate_id_t sheng_end,
+                           UNUSED dstate_id_t sherman_base) {
+    u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng));
+
+    u8 alphaShift = info.getAlphaShift();
+    assert(alphaShift <= 8);
+
+    for (size_t i = 0; i < info.size(); i++) {
+        if (!info.is_normal(i)) {
+            assert(info.implId(i) < sheng_end || info.is_sherman(i));
+            continue;
+        }
+
+        assert(info.implId(i) < sherman_base);
+        u16 normal_id = verify_u16(info.implId(i) - sheng_end);
+
+        for (size_t s = 0; s < info.impl_alpha_size; s++) {
+            dstate_id_t raw_succ = info.states[i].next[s];
+            u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s];
+
+            entry = info.implId(raw_succ);
+            entry |= get_edge_flags(nfa, entry);
+        }
+    }
+}
+
+#define MAX_SHERMAN_LIST_LEN 8
+
+static
+void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
+                  dstate_id_t max) {
+    if (candidate < max) {
+        dest.insert(candidate);
+    }
+}
+
+static
+void addSuccessors(set<dstate_id_t> &dest, const dstate &source,
+                   u16 alphasize, dstate_id_t curr_id) {
+    for (symbol_t s = 0; s < alphasize; s++) {
+        addIfEarlier(dest, source.next[s], curr_id);
+    }
+}
+
+#define MAX_SHERMAN_SELF_LOOP 20
+
+static
+void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
+                       bool any_cyclic_near_anchored_state, const Grey &grey) {
+    if (!grey.allowShermanStates) {
+        return;
+    }
+
+    const u16 width = sizeof(u16);
+    const u16 alphasize = info.impl_alpha_size;
+
+    if (info.raw.start_anchored != DEAD_STATE
+        && any_cyclic_near_anchored_state
+        && curr_id < alphasize * 3) {
+        /* crude attempt to prevent frequent states from being sherman'ed;
+         * depends on the fact that state numbers are currently assigned in
+         * bfs order */
+        DEBUG_PRINTF("%hu is banned\n", curr_id);
+        return;
+    }
+
+    if (info.raw.start_floating != DEAD_STATE
+        && curr_id >= info.raw.start_floating
+        && curr_id < info.raw.start_floating + alphasize * 3) {
+        /* crude attempt to prevent frequent states from being sherman'ed;
+         * depends on the fact that state numbers are currently assigned in
+         * bfs order */
+        DEBUG_PRINTF("%hu is banned (%hu)\n", curr_id, info.raw.start_floating);
+        return;
+    }
+
+    const u16 full_state_size = width * alphasize;
+    const u16 max_list_len = MIN(MAX_SHERMAN_LIST_LEN,
+                                 (full_state_size - 2)/(width + 1));
+    u16 best_score = 0;
+    dstate_id_t best_daddy = 0;
+    dstate &currState = info.states[curr_id];
+
+    set<dstate_id_t> hinted; /* set of states to search for a better daddy */
+    addIfEarlier(hinted, 0, curr_id);
+    addIfEarlier(hinted, info.raw.start_anchored, curr_id);
+    addIfEarlier(hinted, info.raw.start_floating, curr_id);
+
+    dstate_id_t mydaddy = currState.daddy;
+    if (mydaddy) {
+        addIfEarlier(hinted, mydaddy, curr_id);
+        addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id);
+        dstate_id_t mygranddaddy = info.states[mydaddy].daddy;
+        if (mygranddaddy) {
+            addIfEarlier(hinted, mygranddaddy, curr_id);
+            addSuccessors(hinted, info.states[mygranddaddy], alphasize,
+                          curr_id);
+        }
+    }
+
+    for (const dstate_id_t &donor : hinted) {
+        assert(donor < curr_id);
+        u32 score = 0;
+
+        if (!info.is_normal(donor)) {
+            continue;
+        }
+
+        const dstate &donorState = info.states[donor];
+        for (symbol_t s = 0; s < alphasize; s++) {
+            if (currState.next[s] == donorState.next[s]) {
+                score++;
+            }
+        }
+
+        /* prefer lower ids to provide some stability amongst potential
+         * siblings */
+        if (score > best_score || (score == best_score && donor < best_daddy)) {
+            best_daddy = donor;
+            best_score = score;
+
+            if (score == alphasize) {
+                break;
+            }
+        }
+    }
+
+    currState.daddy = best_daddy;
+    info.extra[curr_id].daddytaken = best_score;
+    DEBUG_PRINTF("%hu -> daddy %hu: %u/%u BF\n", curr_id, best_daddy,
+                 best_score, alphasize);
+
+    if (best_daddy == DEAD_STATE) {
+        return; /* No good daddy */
+    }
+
+    if (best_score + max_list_len < alphasize) {
+        return; /* too many transitions differ from the daddy to fit in a
+                 * sherman list
*/ + } + + assert(info.is_normal(currState.daddy)); + + u32 self_loop_width = 0; + const dstate curr_raw = info.states[curr_id]; + for (unsigned i = 0; i < N_CHARS; i++) { + if (curr_raw.next[info.alpha_remap[i]] == curr_id) { + self_loop_width++; + } + } + + if (self_loop_width > MAX_SHERMAN_SELF_LOOP) { + DEBUG_PRINTF("%hu is banned wide self loop (%u)\n", curr_id, + self_loop_width); + return; + } + + if (info.is_sheng(curr_id)) { + return; + } + + DEBUG_PRINTF("%hu is sherman\n", curr_id); + info.extra[curr_id].shermanState = true; +} + +static +bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) { + symbol_t alphasize = raw.getImplAlphaSize(); + for (symbol_t s = 0; s < alphasize; s++) { + dstate_id_t succ_id = raw.states[root].next[s]; + if (succ_id == DEAD_STATE) { + continue; + } + + const dstate &succ = raw.states[succ_id]; + for (symbol_t t = 0; t < alphasize; t++) { + if (succ.next[t] == root || succ.next[t] == succ_id) { + return true; + } + } + } + return false; +} + +static +void fill_in_sherman(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) { + char *nfa_base = (char *)nfa; + mcsheng *m = (mcsheng *)getMutableImplNfa(nfa); + char *sherman_table = nfa_base + m->sherman_offset; + + assert(ISALIGNED_16(sherman_table)); + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_sherman(i)) { + continue; + } + u16 fs = verify_u16(info.implId(i)); + DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs); + + assert(fs >= sherman_limit); + + char *curr_sherman_entry + = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE; + assert(curr_sherman_entry <= nfa_base + m->length); + + u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken); + assert(len <= 9); + dstate_id_t d = info.states[i].daddy; + + *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE; + *(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len; + *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d); + u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET); + + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + *(chars++) = (u8)s; + } + } + + u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len)); + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs, + info.implId(d), + info.implId(info.states[i].next[s])); + u16 entry_val = info.implId(info.states[i].next[s]); + entry_val |= get_edge_flags(nfa, entry_val); + unaligned_store_u16((u8 *)states++, entry_val); + } + } + } +} + +static +aligned_unique_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info, + const Grey &grey) { + DEBUG_PRINTF("building mcsheng 16\n"); + + vector reports; /* index in ri for the appropriate report list */ + vector reports_eod; /* as above */ + ReportID arb; + u8 single; + + assert(info.getAlphaShift() <= 8); + + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } + + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + + u16 sherman_limit; + if (!allocateImplId16(info, sheng_end, &sherman_limit)) { + DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", + info.size()); + return nullptr; + } + u16 
count_real_states = sherman_limit - sheng_end; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) + * count_real_states; + + size_t aux_size = sizeof(mstate_aux) * info.size(); + + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); + size_t sherman_size = calcShermanRegionSize(info); + + size_t total_size = sherman_offset + sherman_size; + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get()); + + populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks(m, info, sheng_end, accel_escape_info); + + /* copy in the mc header information */ + m->sherman_offset = sherman_offset; + m->sherman_end = total_size; + m->sherman_limit = sherman_limit; + + DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end, + count_real_states, info.size()); + + fill_in_aux_info(nfa.get(), info, accel_escape_info, accel_offset, + sherman_offset - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_16(nfa.get(), info, sheng_end, sherman_limit); + + fill_in_sherman(nfa.get(), info, sherman_limit); + + return nfa; +} + +static +void fill_in_succ_table_8(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end) { + u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + assert(!info.is_sherman(i)); + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end); + continue; + } + u8 normal_id = verify_u8(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + succ_table[((size_t)normal_id << alphaShift) + s] + = info.implId(raw_succ); + } + } +} + +static +void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info, + u16 *accel_limit, u16 *accept_limit) { + info.states[0].impl_id = 0; /* dead is always 0 */ + + vector norm; + vector accel; + vector accept; + + assert(info.size() <= (1 << 8)); + + for (u32 i = 1; i < info.size(); i++) { + if (info.is_sheng(i)) { + continue; /* already allocated */ + } else if (!info.states[i].reports.empty()) { + accept.push_back(i); + } else if (contains(accel_escape_info, i)) { + accel.push_back(i); + } else { + norm.push_back(i); + } + } + + u32 j = sheng_end; + for (const dstate_id_t &s : norm) { + assert(j <= 256); + DEBUG_PRINTF("mapping state %u to %u\n", s, j); + info.states[s].impl_id = j++; + } + *accel_limit = j; + for (const dstate_id_t &s : accel) { + assert(j <= 256); + DEBUG_PRINTF("mapping state %u to %u\n", s, j); + info.states[s].impl_id = j++; + } + *accept_limit = j; + for (const dstate_id_t &s : accept) { + assert(j <= 256); + DEBUG_PRINTF("mapping state %u to %u\n", s, j); + info.states[s].impl_id = j++; + } +} + +static +aligned_unique_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("building mcsheng 8\n"); + + 
vector reports; + vector reports_eod; + ReportID arb; + u8 single; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t normal_count = info.size() - sheng_end; + + size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count; + size_t aux_size = sizeof(mstate_aux) * info.size(); + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t total_size = accel_offset + accel_size; + + DEBUG_PRINTF("aux_size %zu\n", aux_size); + DEBUG_PRINTF("aux_offset %zu\n", aux_offset); + DEBUG_PRINTF("rl size %u\n", ri->getReportListSize()); + DEBUG_PRINTF("accel_size %zu\n", accel_size); + DEBUG_PRINTF("accel_offset %zu\n", accel_offset); + DEBUG_PRINTF("total_size %zu\n", total_size); + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get()); + + allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8, + &m->accept_limit_8); + + populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks(m, info, sheng_end, accel_escape_info); + + fill_in_aux_info(nfa.get(), info, accel_escape_info, accel_offset, + total_size - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_8(nfa.get(), info, sheng_end); + + DEBUG_PRINTF("rl size %zu\n", ri->size()); + + return nfa; +} + +aligned_unique_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm) { + if (!cc.grey.allowMcSheng) { + return nullptr; + } + + mcclellan_build_strat mbs(raw, rm); + dfa_info info(mbs); + bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + + bool has_eod_reports = raw.hasEodReports(); + + map accel_escape_info + = info.strat.getAccelInfo(cc.grey); + + dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info); + if (sheng_end <= DEAD_STATE + 1) { + return nullptr; + } + + aligned_unique_ptr nfa; + if (!using8bit) { + nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey); + } else { + nfa = mcshengCompile8(info, sheng_end, accel_escape_info); + } + + if (!nfa) { + return nfa; + } + + if (has_eod_reports) { + nfa->flags |= NFA_ACCEPTS_EOD; + } + + DEBUG_PRINTF("compile done\n"); + return nfa; +} + +bool has_accel_mcsheng(const NFA *) { + return true; /* consider the sheng region as accelerated */ +} + +} // namespace ue2 diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h new file mode 100644 index 00000000..d1ae1e32 --- /dev/null +++ b/src/nfa/mcsheng_compile.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCSHENGCOMPILE_H +#define MCSHENGCOMPILE_H + +#include "accel_dfa_build_strat.h" +#include "rdfa.h" +#include "ue2common.h" +#include "util/alloc.h" +#include "util/ue2_containers.h" + +#include + +struct NFA; + +namespace ue2 { + +class ReportManager; +struct CompileContext; + +ue2::aligned_unique_ptr +mcshengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm); + +bool has_accel_mcsheng(const NFA *nfa); + +} // namespace ue2 + +#endif diff --git a/src/nfa/mcsheng_data.c b/src/nfa/mcsheng_data.c new file mode 100644 index 00000000..eaf3cbbb --- /dev/null +++ b/src/nfa/mcsheng_data.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mcsheng_internal.h" + +/* This table is in a separate translation unit from mcsheng.c as we want to + * prevent the compiler from seeing these constants. We have load resources + * to spare at runtime, so loading the masks there is not a problem. */ +const u64a mcsheng_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff0f, + 0x0000000000ff000f, + 0x00000000ff00000f, + 0x000000ff0000000f, + 0x0000ff000000000f, + 0x00ff00000000000f, + 0xff0000000000000f, +}; diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp new file mode 100644 index 00000000..f5c058af --- /dev/null +++ b/src/nfa/mcsheng_dump.cpp @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "mcsheng_dump.h" + +#include "accel.h" +#include "accel_dump.h" +#include "nfa_dump_internal.h" +#include "nfa_internal.h" +#include "mcsheng_internal.h" +#include "rdfa.h" +#include "ue2common.h" +#include "util/charreach.h" +#include "util/dump_charclass.h" +#include "util/dump_util.h" +#include "util/unaligned.h" + +#include +#include +#include +#include +#include + +#ifndef DUMP_SUPPORT +#error No dump support!
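The mcsheng_pext_mask table above feeds a PEXT (parallel bit extract) at scan time: each mask pairs the low four bits of a u64a of transition data (enough for a sheng state id) with a copy of one of bytes 1..7. A minimal sketch of that bit-gather, using a portable stand-in for PEXT (mask value and sample word are illustrative, not taken from the patch):

    #include <cstdint>
    #include <cstdio>

    // Portable equivalent of PEXT: gather the bits of v selected by mask
    // into the low bits of the result, keeping their order.
    static uint64_t pext64(uint64_t v, uint64_t mask) {
        uint64_t out = 0;
        for (uint64_t bit = 1; mask; mask &= mask - 1, bit <<= 1) {
            if (v & (mask & -mask)) { // lowest set bit of mask
                out |= bit;
            }
        }
        return out;
    }

    int main() {
        // mcsheng_pext_mask[2] pairs the low nibble with byte 2.
        uint64_t word = 0x00000000005a0009ULL; // nibble 0x9, byte 2 = 0x5a
        printf("0x%llx\n", (unsigned long long)
               pext64(word, 0x0000000000ff000fULL)); // prints 0x5a9
        return 0;
    }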
+#endif + +using namespace std; + +namespace ue2 { + +static +const mstate_aux *getAux(const NFA *n, dstate_id_t i) { + auto *m = (const mcsheng *)getImplNfa(n); + auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset); + + const mstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void next_states(const NFA *n, u16 s, u16 *t) { + const mcsheng *m = (const mcsheng *)getImplNfa(n); + const mstate_aux *aux = getAux(n, s); + const u32 as = m->alphaShift; + assert(s != DEAD_STATE); + + if (s < m->sheng_end) { + for (u16 c = 0; c < N_CHARS; c++) { + u8 sheng_s = s - 1; + auto trans_for_c = (const char *)&m->sheng_masks[c]; + assert(sheng_s < sizeof(m128)); + u8 raw_succ = trans_for_c[sheng_s]; + if (raw_succ == m->sheng_end - 1) { + t[c] = DEAD_STATE; + } else if (raw_succ < m->sheng_end) { + t[c] = raw_succ + 1; + } else { + t[c] = raw_succ; + } + } + } else if (n->type == MCSHENG_NFA_8) { + const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + } else { + u16 base_s = s; + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit); + + if (s >= m->sherman_limit) { + base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET); + assert(base_s >= m->sheng_end); + } + + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(mcsheng)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = base_s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + + if (s >= m->sherman_limit) { + UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base); + const char *chars = state_base + SHERMAN_CHARS_OFFSET; + const u16 *states = (const u16 *)(state_base + + SHERMAN_STATES_OFFSET(len)); + + for (u8 i = 0; i < len; i++) { + for (u16 c = 0; c < N_CHARS; c++) { + if (m->remap[c] == chars[i]) { + t[c] = unaligned_load_u16((const u8*)&states[i]); + } + } + } + } + + for (u16 c = 0; c < N_CHARS; c++) { + t[c] &= STATE_MASK; + } + + } + + t[TOP] = aux->top & STATE_MASK; +} + +static +void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ ", i, t[s]); + if (i < m->sheng_end && t[s] < m->sheng_end) { + fprintf(f, "color = red, fontcolor = red "); + } + fprintf(f, "label = \""); + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} + +static +void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { + switch(accel->accel_type) { + case ACCEL_NONE: + break; + case ACCEL_VERM: + case ACCEL_VERM_NOCASE: + case ACCEL_DVERM: + case ACCEL_DVERM_NOCASE: + fprintf(f, "%u [ color = forestgreen style=diagonals];\n", i); + break; + case ACCEL_SHUFTI: + case ACCEL_DSHUFTI: + case ACCEL_TRUFFLE: + fprintf(f, "%u [ color = darkgreen style=diagonals ];\n", i); + break; + default: + fprintf(f, "%u [ color = yellow style=diagonals ];\n", i); + break; + } +} + +static +void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) { + const 
mstate_aux *aux = getAux(n, i); + + bool isSherman = m->sherman_limit && i >= m->sherman_limit; + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u%s\" ]; \n", i, i, isSherman ? "w":""); + + if (aux->accel_offset) { + dumpAccelDot(f, i, (const union AccelAux *) + ((const char *)m + aux->accel_offset)); + } + + if (i && i < m->sheng_end) { + fprintf(f, "%u [color = red, fontcolor = red]; \n", i); + } + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && aux->top != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top); + } + + if (i == m->start_anchored) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == m->start_floating) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } + + if (isSherman) { + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit); + assert(state_base < (const char *)m + m->length - sizeof(NFA)); + UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i); + u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET); + if (daddy) { + fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n", + i, daddy); + } + } + + if (i && i < m->sheng_end) { + fprintf(f, "subgraph cluster_sheng { %u } \n", i); + } + +} + +static +void dumpDotPreambleDfa(FILE *f) { + dumpDotPreamble(f); + + // DFA specific additions. + fprintf(f, "STARTF [style=invis];\n"); + fprintf(f, "STARTA [style=invis];\n"); + fprintf(f, "0 [style=invis];\n"); + fprintf(f, "subgraph cluster_sheng { style = dashed }\n"); +} + +static +void dump_dot_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states(nfa, i, t); + + describeEdge(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dump_dot_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states(nfa, i, t); + + describeEdge(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dumpAccelMasks(FILE *f, const mcsheng *m, const mstate_aux *aux) { + fprintf(f, "\n"); + fprintf(f, "Acceleration\n"); + fprintf(f, "------------\n"); + + for (u16 i = 0; i < m->state_count; i++) { + if (!aux[i].accel_offset) { + continue; + } + + auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset); + fprintf(f, "%05hu ", i); + dumpAccelInfo(f, *accel); + } +} + +static +void describeAlphabet(FILE *f, const mcsheng *m) { + map rev; + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].clear(); + } + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].set(i); + } + + map::const_iterator it; + fprintf(f, "\nAlphabet\n"); + for (it = rev.begin(); it != rev.end(); ++it) { + fprintf(f, "%3hhu: ", it->first); + describeClass(f, it->second, 10240, CC_OUT_TEXT); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +static +void dumpCommonHeader(FILE *f, const mcsheng *m) { + fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report, + m->state_count, m->length); + fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored, 
+ m->start_floating); + fprintf(f, "single accept: %d, has_accel: %d\n", + !!(int)(m->flags & MCSHENG_FLAG_SINGLE), m->has_accel); + fprintf(f, "sheng_end: %hu\n", m->sheng_end); + fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit); +} + +static +void dump_text_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng *)getImplNfa(nfa); + auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 16\n"); + dumpCommonHeader(f, m); + fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit, + (int)m->sherman_end); + fprintf(f, "\n"); + + describeAlphabet(f, m); + dumpAccelMasks(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + +static +void dump_text_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng *)getImplNfa(nfa); + auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 8\n"); + dumpCommonHeader(f, m); + fprintf(f, "accel_limit: %hu, accept_limit: %hu\n", m->accel_limit_8, + m->accept_limit_8); + fprintf(f, "\n"); + + describeAlphabet(f, m); + dumpAccelMasks(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + +void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) { + assert(nfa->type == MCSHENG_NFA_16); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + dump_text_16(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + dump_dot_16(nfa, f); + fclose(f); +} + +void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) { + assert(nfa->type == MCSHENG_NFA_8); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + dump_text_8(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + dump_dot_8(nfa, f); + fclose(f); +} + +} // namespace ue2 diff --git a/src/nfa/mcsheng_dump.h b/src/nfa/mcsheng_dump.h new file mode 100644 index 00000000..1b699367 --- /dev/null +++ b/src/nfa/mcsheng_dump.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
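A hypothetical driver for the dump entry points defined above and declared in the header below (DUMP_SUPPORT builds only; the wrapper name and output prefix are invented for illustration):

    #include "mcsheng_dump.h"
    #include "nfa_internal.h"

    #include <string>

    // Writes <base>.txt and <base>.dot for either mcsheng variant.
    void dumpMcSheng(const struct NFA *nfa, const std::string &base) {
        if (nfa->type == MCSHENG_NFA_16) {
            ue2::nfaExecMcSheng16_dump(nfa, base);
        } else {
            ue2::nfaExecMcSheng8_dump(nfa, base);
        }
    }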
+ */ + +#ifndef MCSHENG_DUMP_H +#define MCSHENG_DUMP_H + +#ifdef DUMP_SUPPORT + +#include "rdfa.h" + +#include +#include + +struct NFA; + +namespace ue2 { + +void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base); +void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base); + +} // namespace ue2 + +#endif // DUMP_SUPPORT + +#endif // MCSHENG_DUMP_H diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h new file mode 100644 index 00000000..81a658e0 --- /dev/null +++ b/src/nfa/mcsheng_internal.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
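A minimal decoder sketch for the Sherman (exception) state records laid out by the SHERMAN_* offsets defined just below (the record bytes are hypothetical; unaligned little-endian u16s assumed, as in the dump code above):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        // type 1 (SHERMAN_STATE), len 2, daddy state 5, exception chars
        // 'a' and 'b' going to states 7 and 9 (unaligned u16s).
        const uint8_t rec[32] = {1, 2, 5, 0, 'a', 'b', 7, 0, 9, 0};
        uint8_t len = rec[1];                   // SHERMAN_LEN_OFFSET
        uint16_t daddy;
        std::memcpy(&daddy, rec + 2, 2);        // SHERMAN_DADDY_OFFSET
        const uint8_t *chars = rec + 4;         // SHERMAN_CHARS_OFFSET
        const uint8_t *states = rec + 4 + len;  // SHERMAN_STATES_OFFSET(len)
        for (int i = 0; i < len; i++) {
            uint16_t succ;
            std::memcpy(&succ, states + 2 * i, 2);
            printf("'%c' -> %u (otherwise daddy %u)\n", chars[i],
                   (unsigned)succ, (unsigned)daddy);
        }
        return 0;
    }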
+ */ + +#ifndef MCSHENG_INTERNAL_H +#define MCSHENG_INTERNAL_H + +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/simd_types.h" + +#define ACCEPT_FLAG 0x8000 +#define ACCEL_FLAG 0x4000 +#define STATE_MASK 0x3fff + +#define SHERMAN_STATE 1 + +#define SHERMAN_TYPE_OFFSET 0 +#define SHERMAN_FIXED_SIZE 32 + +#define SHERMAN_LEN_OFFSET 1 +#define SHERMAN_DADDY_OFFSET 2 +#define SHERMAN_CHARS_OFFSET 4 +#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct mstate_aux { + u32 accept; + u32 accept_eod; + u16 top; + u32 accel_offset; /* relative to start of struct mcsheng; 0 if no accel */ +}; + +#define MCSHENG_FLAG_SINGLE 1 /**< we raise only single accept id */ + +struct mcsheng { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of the sherman state_info structures + * relative to the start of the nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of the accel structures from start of NFA */ + m128 sheng_masks[N_CHARS]; +}; + +/* pext masks for the runtime to extract copies of bytes 1..7 of a u64a + * holding the transition data.
*/ +extern const u64a mcsheng_pext_mask[8]; + +#endif diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c index c6c8cb88..552754d6 100644 --- a/src/nfa/mpv.c +++ b/src/nfa/mpv.c @@ -825,21 +825,21 @@ void mpvStoreState(const struct NFA *n, char *state, } } -char nfaExecMpv0_queueCompressState(const struct NFA *nfa, const struct mq *q, - UNUSED s64a loc) { +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + UNUSED s64a loc) { void *dest = q->streamState; const void *src = q->state; mpvStoreState(nfa, dest, src); return 0; } -char nfaExecMpv0_expandState(const struct NFA *nfa, void *dest, const void *src, - UNUSED u64a offset, UNUSED u8 key) { +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + UNUSED u64a offset, UNUSED u8 key) { mpvLoadState(dest, nfa, src); return 0; } -char nfaExecMpv0_reportCurrent(const struct NFA *n, struct mq *q) { +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q) { const struct mpv *m = getImplNfa(n); u64a offset = q_cur_offset(q); struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; @@ -855,7 +855,7 @@ char nfaExecMpv0_reportCurrent(const struct NFA *n, struct mq *q) { return 0; } -char nfaExecMpv0_queueInitState(const struct NFA *n, struct mq *q) { +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q) { struct mpv_decomp_state *out = (void *)q->state; const struct mpv *m = getImplNfa(n); assert(sizeof(*out) <= n->scratchStateSize); @@ -880,8 +880,8 @@ char nfaExecMpv0_queueInitState(const struct NFA *n, struct mq *q) { return 0; } -char nfaExecMpv0_initCompressedState(const struct NFA *n, u64a offset, - void *state, UNUSED u8 key) { +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, UNUSED u8 key) { const struct mpv *m = getImplNfa(n); memset(state, 0, m->active_offset); /* active_offset marks end of comp * counters */ @@ -896,7 +896,7 @@ char nfaExecMpv0_initCompressedState(const struct NFA *n, u64a offset, } static really_inline -char nfaExecMpv0_Q_i(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecMpv_Q_i(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; size_t length = q->length; @@ -1021,18 +1021,18 @@ char nfaExecMpv0_Q_i(const struct NFA *n, struct mq *q, s64a end) { return alive; } -char nfaExecMpv0_Q(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end) { DEBUG_PRINTF("_Q %lld\n", end); - return nfaExecMpv0_Q_i(n, q, end); + return nfaExecMpv_Q_i(n, q, end); } -s64a nfaExecMpv0_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end) { +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end) { DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); #ifdef DEBUG debugQueue(q); #endif - assert(nfa->type == MPV_NFA_0); + assert(nfa->type == MPV_NFA); assert(q && q->context && q->state); assert(end >= 0); assert(q->cur < q->end); @@ -1058,7 +1058,7 @@ s64a nfaExecMpv0_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end) { /* TODO: restore max offset stuff, if/when _interesting_ max offset stuff * is filled in */ - char rv = nfaExecMpv0_Q_i(nfa, q, end); + char rv = nfaExecMpv_Q_i(nfa, q, end); assert(!q->report_current); DEBUG_PRINTF("returned rv=%d, q_trimmed=%d\n", rv, q_trimmed); diff --git a/src/nfa/mpv.h b/src/nfa/mpv.h index a3f90719..3780728d 100644 --- a/src/nfa/mpv.h +++ b/src/nfa/mpv.h @@ -34,27 +34,27 @@ struct mq; struct NFA; -char nfaExecMpv0_Q(const struct NFA *n, struct mq *q, s64a 
end); -char nfaExecMpv0_reportCurrent(const struct NFA *n, struct mq *q); -char nfaExecMpv0_queueInitState(const struct NFA *n, struct mq *q); -char nfaExecMpv0_initCompressedState(const struct NFA *n, u64a offset, - void *state, u8 key); -char nfaExecMpv0_queueCompressState(const struct NFA *nfa, const struct mq *q, - s64a loc); -char nfaExecMpv0_expandState(const struct NFA *nfa, void *dest, const void *src, - u64a offset, u8 key); +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key); -#define nfaExecMpv0_testEOD NFA_API_NO_IMPL -#define nfaExecMpv0_inAccept NFA_API_NO_IMPL -#define nfaExecMpv0_inAnyAccept NFA_API_NO_IMPL -#define nfaExecMpv0_QR NFA_API_NO_IMPL -#define nfaExecMpv0_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ -#define nfaExecMpv0_B_Reverse NFA_API_NO_IMPL -#define nfaExecMpv0_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMpv_testEOD NFA_API_NO_IMPL +#define nfaExecMpv_inAccept NFA_API_NO_IMPL +#define nfaExecMpv_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMpv_QR NFA_API_NO_IMPL +#define nfaExecMpv_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ +#define nfaExecMpv_B_Reverse NFA_API_NO_IMPL +#define nfaExecMpv_zombie_status NFA_API_ZOMBIE_NO_IMPL /** * return 0 if the mpv dies, otherwise returns the location of the next possible * match (given the currently known events). */ -s64a nfaExecMpv0_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end); +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end); #endif diff --git a/src/nfa/mpv_dump.cpp b/src/nfa/mpv_dump.cpp index da21d7cf..9a8a4067 100644 --- a/src/nfa/mpv_dump.cpp +++ b/src/nfa/mpv_dump.cpp @@ -36,6 +36,7 @@ #include "ue2common.h" #include "util/compare.h" #include "util/dump_mask.h" +#include "util/dump_util.h" #include #include @@ -46,11 +47,11 @@ #error No dump support! 
#endif -namespace ue2 { +/* Note: No dot files for MPV */ -void nfaExecMpv0_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *file, - UNUSED const std::string &base) { -} +using namespace std; + +namespace ue2 { static really_inline u32 largest_puff_repeat(const mpv *m, const mpv_kilopuff *kp) { @@ -128,9 +129,11 @@ void dumpCounter(FILE *f, const mpv_counter_info *c) { fprintf(f, "\n"); } -void nfaExecMpv0_dumpText(const NFA *nfa, FILE *f) { +void nfaExecMpv_dump(const NFA *nfa, const string &base) { const mpv *m = (const mpv *)getImplNfa(nfa); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + fprintf(f, "Puff the Magic Engines\n"); fprintf(f, "\n"); fprintf(f, "%u puffettes in %u kilopuffs\n", m->puffette_count, @@ -151,6 +154,7 @@ void nfaExecMpv0_dumpText(const NFA *nfa, FILE *f) { } dumpTextReverse(nfa, f); + fclose(f); } } // namespace ue2 diff --git a/src/nfa/mpv_dump.h b/src/nfa/mpv_dump.h index 23910dce..e587619e 100644 --- a/src/nfa/mpv_dump.h +++ b/src/nfa/mpv_dump.h @@ -31,16 +31,13 @@ #if defined(DUMP_SUPPORT) -#include #include struct NFA; namespace ue2 { -void nfaExecMpv0_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecMpv0_dumpText(const struct NFA *nfa, FILE *file); +void nfaExecMpv_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/mpvcompile.cpp b/src/nfa/mpvcompile.cpp index b024b530..87fb462e 100644 --- a/src/nfa/mpvcompile.cpp +++ b/src/nfa/mpvcompile.cpp @@ -34,7 +34,7 @@ #include "shufticompile.h" #include "trufflecompile.h" #include "util/alloc.h" -#include "util/multibit_internal.h" +#include "util/multibit_build.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/verify_types.h" @@ -175,12 +175,13 @@ void writeKiloPuff(const map>::const_iterator &it, size_t set = reach.find_first(); assert(set != CharReach::npos); kp->u.verm.c = (char)set; - } else if (shuftiBuildMasks(~reach, &kp->u.shuf.mask_lo, - &kp->u.shuf.mask_hi) != -1) { + } else if (shuftiBuildMasks(~reach, (u8 *)&kp->u.shuf.mask_lo, + (u8 *)&kp->u.shuf.mask_hi) != -1) { kp->type = MPV_SHUFTI; } else { kp->type = MPV_TRUFFLE; - truffleBuildMasks(~reach, &kp->u.truffle.mask1, &kp->u.truffle.mask2); + truffleBuildMasks(~reach, (u8 *)&kp->u.truffle.mask1, + (u8 *)&kp->u.truffle.mask2); } kp->count = verify_u32(puffs.size()); @@ -207,7 +208,7 @@ void writeCoreNfa(NFA *nfa, u32 len, u32 min_width, u32 max_counter, nfa->length = len; nfa->nPositions = max_counter - 1; - nfa->type = MPV_NFA_0; + nfa->type = MPV_NFA; nfa->streamStateSize = streamStateSize; assert(16 >= sizeof(mpv_decomp_kilo)); nfa->scratchStateSize = scratchStateSize; diff --git a/src/nfa/multishufti.h b/src/nfa/multishufti.h index bcccf607..af578483 100644 --- a/src/nfa/multishufti.h +++ b/src/nfa/multishufti.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,7 @@ #define MULTISHUFTI_H #include "ue2common.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #ifdef __cplusplus extern "C" diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index c67103b3..f4b7552e 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -41,39 +41,43 @@ #include "lbr.h" #include "limex.h" #include "mcclellan.h" +#include "mcsheng.h" #include "mpv.h" #include "sheng.h" #include "tamarama.h" 
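The hunk below reworks DISPATCH_CASE so the case label and the function-name stem are passed as single tokens instead of being pasted together from family/subtype pairs. A toy model of the new expansion (all names invented for illustration):

    #include <cstdio>

    enum ToyType { TOY_A, TOY_B };

    static int execToyA_run(int x) { return x + 1; }
    static int execToyB_run(int x) { return x * 2; }

    // As in the patch: paste the prefix, the stem and the call in one go, so
    // DISPATCH_CASE(TOY_A, ToyA, _run(x)) becomes
    // "case TOY_A: return execToyA_run(x); break;".
    #define DISPATCH_CASE(dc_ltype, dc_ftype, dc_func_call)                    \
        case dc_ltype:                                                         \
            return exec##dc_ftype##dc_func_call;                               \
            break

    static int dispatch(ToyType t, int x) {
        switch (t) {
            DISPATCH_CASE(TOY_A, ToyA, _run(x));
            DISPATCH_CASE(TOY_B, ToyB, _run(x));
        }
        return -1;
    }

    int main() {
        printf("%d %d\n", dispatch(TOY_A, 3), dispatch(TOY_B, 3)); // 4 6
        return 0;
    }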
-#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_subtype, dc_func_call) \ - case dc_ltype##_NFA_##dc_subtype: \ - return nfaExec##dc_ftype##dc_subtype##dc_func_call; \ +#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_func_call) \ + case dc_ltype: \ + return nfaExec##dc_ftype##dc_func_call; \ break // general framework calls -#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ - switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ - DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ - DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ - DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ - DISPATCH_CASE(GOUGH, Gough, 16, dbnt_func); \ - DISPATCH_CASE(MPV, Mpv, 0, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Dot, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Verm, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, NVerm, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ - DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ - DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ - DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ - default: \ - assert(0); \ +#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ + switch (nfa->type) { \ + DISPATCH_CASE(LIMEX_NFA_32, LimEx32, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_64, LimEx64, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_128, LimEx128, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_256, LimEx256, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_384, LimEx384, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_512, LimEx512, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_8, McClellan8, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_16, McClellan16, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_8, Gough8, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_16, Gough16, dbnt_func); \ + DISPATCH_CASE(MPV_NFA, Mpv, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_DOT, LbrDot, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_VERM, LbrVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_NVERM, LbrNVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_SHUF, LbrShuf, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_TRUF, LbrTruf, dbnt_func); \ + DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func); \ + DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + default: \ + assert(0); \ } char nfaCheckFinalState(const struct NFA *nfa, const char *state, diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 93376b01..3103cd29 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -30,6 +30,7 @@ #include "limex_internal.h" #include "mcclellancompile.h" +#include "mcsheng_compile.h" #include "shengcompile.h" #include "nfa_internal.h" #include "repeat_internal.h" @@ -170,17 +171,16 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; #define DO_IF_DUMP_SUPPORT(a) #endif -#define MAKE_LIMEX_TRAITS(mlt_size) \ +#define MAKE_LIMEX_TRAITS(mlt_size, mlt_align) \ template<> struct NFATraits { \ static UNUSED const char *name; \ static const NFACategory category = NFA_LIMEX; \ typedef LimExNFA##mlt_size implNFA_t; \ - typedef u_##mlt_size tableRow_t; \ static const nfa_dispatch_fn has_accel; \ static const nfa_dispatch_fn has_repeats; \ static const nfa_dispatch_fn has_repeats_other_than_firsts; \ static const u32 stateAlign = \ - MAX(alignof(tableRow_t), alignof(RepeatControl)); \ + 
MAX(mlt_align, alignof(RepeatControl)); \ static const bool fast = mlt_size <= 64; \ }; \ const nfa_dispatch_fn NFATraits::has_accel \ @@ -194,16 +194,17 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; const char *NFATraits::name \ = "LimEx "#mlt_size; \ template<> struct getDescription { \ - static string call(const void *ptr) { \ - return getDescriptionLimEx((const NFA *)ptr); \ - } \ + static string call(const void *p) { \ + return getDescriptionLimEx((const NFA *)p); \ + } \ };) -MAKE_LIMEX_TRAITS(32) -MAKE_LIMEX_TRAITS(128) -MAKE_LIMEX_TRAITS(256) -MAKE_LIMEX_TRAITS(384) -MAKE_LIMEX_TRAITS(512) +MAKE_LIMEX_TRAITS(32, alignof(u32)) +MAKE_LIMEX_TRAITS(64, alignof(m128)) /* special, 32bit arch uses m128 */ +MAKE_LIMEX_TRAITS(128, alignof(m128)) +MAKE_LIMEX_TRAITS(256, alignof(m256)) +MAKE_LIMEX_TRAITS(384, alignof(m384)) +MAKE_LIMEX_TRAITS(512, alignof(m512)) template<> struct NFATraits { UNUSED static const char *name; @@ -269,7 +270,7 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = d const char *NFATraits::name = "Goughfish 16"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -278,14 +279,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Mega-Puff-Vac"; +const char *NFATraits::name = "Mega-Puff-Vac"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -294,14 +295,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Castle"; +const char *NFATraits::name = "Castle"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -310,14 +311,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = 
dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Lim Bounded Repeat (D)"; +const char *NFATraits::name = "Lim Bounded Repeat (D)"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -326,14 +327,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Lim Bounded Repeat (V)"; +const char *NFATraits::name = "Lim Bounded Repeat (V)"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -342,14 +343,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Lim Bounded Repeat (NV)"; +const char *NFATraits::name = "Lim Bounded Repeat (NV)"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -358,14 +359,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Lim Bounded Repeat (S)"; +const char *NFATraits::name = "Lim Bounded Repeat (S)"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -374,14 +375,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn 
NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Lim Bounded Repeat (M)"; +const char *NFATraits::name = "Lim Bounded Repeat (M)"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; @@ -390,14 +391,14 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Sheng"; +const char *NFATraits::name = "Sheng"; #endif -template<> struct NFATraits { +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 32; @@ -406,11 +407,43 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; -const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) -const char *NFATraits::name = "Tamarama"; +const char *NFATraits::name = "Tamarama"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy McShengFace 8"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 2; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy McShengFace 16"; #endif } // namespace diff --git a/src/nfa/nfa_dump_api.h b/src/nfa/nfa_dump_api.h index 1054a204..a0c4a9c9 100644 --- a/src/nfa/nfa_dump_api.h +++ b/src/nfa/nfa_dump_api.h @@ -35,7 +35,6 @@ #if defined(DUMP_SUPPORT) -#include #include struct NFA; @@ -43,13 +42,11 @@ struct NFA; namespace ue2 { /** - * \brief Dump (in Graphviz 'dot' format) a 
representation of the NFA into the - * file pointed to by dotFile. + * \brief Dump files representing the engine. All files dumped should begin with + * path/prefix specified by base. Generally a text file and a graphviz (dot) + * file should be produced. */ -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, const std::string &base); - -/** \brief Dump a textual representation of the NFA. */ -void nfaDumpText(const struct NFA *fact, FILE *textFile); +void nfaGenerateDumpFiles(const struct NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 388ac003..5607ed27 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -39,6 +39,7 @@ #include "lbr_dump.h" #include "limex.h" #include "mcclellandump.h" +#include "mcsheng_dump.h" #include "mpv_dump.h" #include "shengdump.h" #include "tamarama_dump.h" @@ -49,45 +50,43 @@ namespace ue2 { -#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_subtype, dc_func_call) \ - case dc_ltype##_NFA_##dc_subtype: \ - nfaExec##dc_ftype##dc_subtype##dc_func_call; \ +#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_func_call) \ + case dc_ltype: \ + nfaExec##dc_ftype##dc_func_call; \ break // general framework calls -#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ - DEBUG_PRINTF("dispatch for NFA type %u\n", nfa->type); \ - switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ - DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ - DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ - DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ - DISPATCH_CASE(GOUGH, Gough, 16, dbnt_func); \ - DISPATCH_CASE(MPV, Mpv, 0, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Dot, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Verm, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, NVerm, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ - DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ - DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ - DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ - DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ - default: \ - assert(0); \ +#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ + DEBUG_PRINTF("dispatch for NFA type %u\n", nfa->type); \ + switch (nfa->type) { \ + DISPATCH_CASE(LIMEX_NFA_32, LimEx32, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_64, LimEx64, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_128, LimEx128, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_256, LimEx256, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_384, LimEx384, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_512, LimEx512, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_8, McClellan8, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_16, McClellan16, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_8, Gough8, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_16, Gough16, dbnt_func); \ + DISPATCH_CASE(MPV_NFA, Mpv, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_DOT, LbrDot, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_VERM, LbrVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_NVERM, LbrNVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_SHUF, LbrShuf, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_TRUF, LbrTruf, dbnt_func); \ + DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func); \ + DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + default: \ +
assert(0); \ } -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, - const std::string &base) { - DISPATCH_BY_NFA_TYPE(_dumpDot(nfa, dotFile, base)); -} - -void nfaDumpText(const struct NFA *nfa, FILE *txtFile) { - DISPATCH_BY_NFA_TYPE(_dumpText(nfa, txtFile)); +void nfaGenerateDumpFiles(const struct NFA *nfa, const std::string &base) { + DISPATCH_BY_NFA_TYPE(_dump(nfa, base)); } } // namespace ue2 diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 41fee73e..9d280822 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -52,6 +52,7 @@ extern "C" enum NFAEngineType { LIMEX_NFA_32, + LIMEX_NFA_64, LIMEX_NFA_128, LIMEX_NFA_256, LIMEX_NFA_384, @@ -60,15 +61,17 @@ enum NFAEngineType { MCCLELLAN_NFA_16, /**< magic pseudo nfa */ GOUGH_NFA_8, /**< magic pseudo nfa */ GOUGH_NFA_16, /**< magic pseudo nfa */ - MPV_NFA_0, /**< magic pseudo nfa */ - LBR_NFA_Dot, /**< magic pseudo nfa */ - LBR_NFA_Verm, /**< magic pseudo nfa */ - LBR_NFA_NVerm, /**< magic pseudo nfa */ - LBR_NFA_Shuf, /**< magic pseudo nfa */ - LBR_NFA_Truf, /**< magic pseudo nfa */ - CASTLE_NFA_0, /**< magic pseudo nfa */ - SHENG_NFA_0, /**< magic pseudo nfa */ - TAMARAMA_NFA_0, /**< magic nfa container */ + MPV_NFA, /**< magic pseudo nfa */ + LBR_NFA_DOT, /**< magic pseudo nfa */ + LBR_NFA_VERM, /**< magic pseudo nfa */ + LBR_NFA_NVERM, /**< magic pseudo nfa */ + LBR_NFA_SHUF, /**< magic pseudo nfa */ + LBR_NFA_TRUF, /**< magic pseudo nfa */ + CASTLE_NFA, /**< magic pseudo nfa */ + SHENG_NFA, /**< magic pseudo nfa */ + TAMARAMA_NFA, /**< magic nfa container */ + MCSHENG_NFA_8, /**< magic pseudo nfa */ + MCSHENG_NFA_16, /**< magic pseudo nfa */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -142,6 +145,12 @@ static really_inline int isMcClellanType(u8 t) { return t == MCCLELLAN_NFA_8 || t == MCCLELLAN_NFA_16; } +/** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid + * DFA. */ +static really_inline int isShengMcClellanType(u8 t) { + return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16; +} + /** \brief True if the given type (from NFA::type) is a Gough DFA. */ static really_inline int isGoughType(u8 t) { return t == GOUGH_NFA_8 || t == GOUGH_NFA_16; @@ -149,7 +158,7 @@ static really_inline int isGoughType(u8 t) { /** \brief True if the given type (from NFA::type) is a Sheng DFA. */ static really_inline int isShengType(u8 t) { - return t == SHENG_NFA_0; + return t == SHENG_NFA; } /** @@ -157,13 +166,23 @@ static really_inline int isShengType(u8 t) { * Sheng DFA. */ static really_inline int isDfaType(u8 t) { - return isMcClellanType(t) || isGoughType(t) || isShengType(t); + return isMcClellanType(t) || isGoughType(t) || isShengType(t) + || isShengMcClellanType(t); +} + +static really_inline int isBigDfaType(u8 t) { + return t == MCCLELLAN_NFA_16 || t == MCSHENG_NFA_16 || t == GOUGH_NFA_16; +} + +static really_inline int isSmallDfaType(u8 t) { + return isDfaType(t) && !isBigDfaType(t); } /** \brief True if the given type (from NFA::type) is an NFA. */ static really_inline int isNfaType(u8 t) { switch (t) { case LIMEX_NFA_32: + case LIMEX_NFA_64: case LIMEX_NFA_128: case LIMEX_NFA_256: case LIMEX_NFA_384: @@ -178,14 +197,14 @@ static really_inline int isNfaType(u8 t) { /** \brief True if the given type (from NFA::type) is an LBR. 
*/ static really_inline int isLbrType(u8 t) { - return t == LBR_NFA_Dot || t == LBR_NFA_Verm || t == LBR_NFA_NVerm || - t == LBR_NFA_Shuf || t == LBR_NFA_Truf; + return t == LBR_NFA_DOT || t == LBR_NFA_VERM || t == LBR_NFA_NVERM || + t == LBR_NFA_SHUF || t == LBR_NFA_TRUF; } /** \brief True if the given type (from NFA::type) is a container engine. */ static really_inline int isContainerType(u8 t) { - return t == TAMARAMA_NFA_0; + return t == TAMARAMA_NFA; } static really_inline @@ -200,14 +219,14 @@ int isMultiTopType(u8 t) { /* Use for functions that return an integer. */ #define NFA_API_NO_IMPL(...) \ ({ \ - assert(!"not implemented for this engine!"); \ + assert(!"not implemented for this engine!"); \ 0; /* return value, for places that need it */ \ }) /* Use for _zombie_status functions. */ #define NFA_API_ZOMBIE_NO_IMPL(...) \ ({ \ - assert(!"not implemented for this engine!"); \ + assert(!"not implemented for this engine!"); \ NFA_ZOMBIE_NO; \ }) diff --git a/src/nfa/rdfa_graph.cpp b/src/nfa/rdfa_graph.cpp new file mode 100644 index 00000000..2467748b --- /dev/null +++ b/src/nfa/rdfa_graph.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include "rdfa_graph.h" + +#include "rdfa.h" +#include "util/container.h" + +#include + +using namespace std; + +namespace ue2 { + +RdfaGraph::RdfaGraph(const raw_dfa &rdfa) { + RdfaGraph &g = *this; + + vector<RdfaGraph::vertex_descriptor> verts; + verts.reserve(rdfa.states.size()); + for (dstate_id_t i = 0; i < rdfa.states.size(); i++) { + verts.push_back(add_vertex(g)); + assert(g[verts.back()].index == i); + } + + symbol_t symbol_end = rdfa.alpha_size - 1; + + flat_set<dstate_id_t> local_succs; + for (dstate_id_t i = 0; i < rdfa.states.size(); i++) { + local_succs.clear(); + for (symbol_t s = 0; s < symbol_end; s++) { + dstate_id_t next = rdfa.states[i].next[s]; + if (contains(local_succs, next)) { + continue; + } + DEBUG_PRINTF("%hu->%hu\n", i, next); + add_edge(verts[i], verts[next], g); + local_succs.insert(next); + } + } +} + +} diff --git a/src/nfa/rdfa_graph.h b/src/nfa/rdfa_graph.h new file mode 100644 index 00000000..6d166c2f --- /dev/null +++ b/src/nfa/rdfa_graph.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
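A self-contained toy of the RdfaGraph constructor above: one edge per distinct successor of each state, mirroring the local_succs dedup (types simplified; the real code uses dstate_id_t, flat_set and ue2_graph):

    #include <cstdio>
    #include <set>
    #include <utility>
    #include <vector>

    int main() {
        // next[i][s] = successor of state i on symbol s.
        std::vector<std::vector<int>> next = {
            {0, 0, 0}, // state 0 (dead): every symbol loops, one edge
            {2, 2, 1}, // state 1: two distinct successors, two edges
        };
        std::vector<std::pair<int, int>> edges;
        for (int i = 0; i < (int)next.size(); i++) {
            std::set<int> local_succs; // stands in for flat_set
            for (int s : next[i]) {
                if (local_succs.insert(s).second) {
                    edges.push_back({i, s});
                }
            }
        }
        printf("%zu edges\n", edges.size()); // prints "3 edges"
        return 0;
    }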
+ */ + +#ifndef RDFA_GRAPH_H +#define RDFA_GRAPH_H + +#include "ue2common.h" +#include "util/ue2_graph.h" + +namespace ue2 { + +struct raw_dfa; + +struct RdfaVertexProps { + size_t index = 0; +}; + +struct RdfaEdgeProps { + size_t index = 0; +}; + +struct RdfaGraph : public ue2_graph { + RdfaGraph(const raw_dfa &rdfa); +}; + + +} + +#endif diff --git a/src/nfa/repeatcompile.cpp b/src/nfa/repeatcompile.cpp index 2e1010bb..934dd29e 100644 --- a/src/nfa/repeatcompile.cpp +++ b/src/nfa/repeatcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,7 @@ #include "util/charreach.h" #include "util/depth.h" #include "util/dump_charclass.h" -#include "util/multibit_internal.h" +#include "util/multibit_build.h" #include "util/verify_types.h" #include diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index bbbf1f20..837aa7df 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -405,9 +405,7 @@ char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, const u8 * scanned = cur_buf; char rv; - /* if we're in nomatch mode or if we're scanning history buffer */ - if (mode == NO_MATCHES || - (cur_start < 0 && mode == CALLBACK_OUTPUT)) { + if (mode == NO_MATCHES) { runShengNm(sh, q->cb, q->context, q->offset, &cached_accept_state, &cached_accept_id, cur_buf, cur_buf + cur_start, cur_buf + cur_end, can_die, @@ -506,10 +504,10 @@ char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, } } -char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, - size_t length, NfaCallback cb, void *context) { +char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { DEBUG_PRINTF("smallwrite Sheng\n"); - assert(n->type == SHENG_NFA_0); + assert(n->type == SHENG_NFA); const struct sheng *sh = getImplNfa(n); u8 state = sh->anchored; u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; @@ -545,32 +543,31 @@ char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, return state & SHENG_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; } -char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end) { const struct sheng *sh = get_sheng(n); char rv = runSheng(sh, q, end, CALLBACK_OUTPUT); return rv; } -char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end) { const struct sheng *sh = get_sheng(n); char rv = runSheng(sh, q, end, STOP_AT_MATCH); return rv; } -char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report) { +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report) { assert(q_cur_type(q) == MQE_START); const struct sheng *sh = get_sheng(n); char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES); - if (rv && nfaExecSheng0_inAccept(n, report, q)) { + if (rv && nfaExecSheng_inAccept(n, report, q)) { return MO_MATCHES_PENDING; } return rv; } -char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, - struct mq *q) { +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q) { assert(n && q); const struct sheng *sh = get_sheng(n); @@ -586,7 +583,7 @@ char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, return shengHasAccept(sh, aux, report); } -char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q) { +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q) { assert(n && q); const struct sheng *sh = get_sheng(n); @@ -597,9 +594,9 @@ char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q) { return !!aux->accept; } -char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, u64a offset, - NfaCallback cb, void *ctxt) { +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { assert(nfa); const struct sheng *sh = get_sheng(nfa); @@ -615,7 +612,7 @@ char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1); } -char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q) { +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q) { const struct sheng *sh = (const struct sheng *)getImplNfa(n); NfaCallback cb = q->cb; void *ctxt = q->context; @@ -638,15 +635,15 @@ char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q) { return 0; } -char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, - void *state, UNUSED u8 key) { +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { const struct sheng *sh = get_sheng(nfa); u8 *s = (u8 *)state; *s = offset ? 
sh->floating: sh->anchored; return !(*s & SHENG_STATE_DEAD); } -char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q) { +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q) { assert(nfa->scratchStateSize == 1); /* starting in floating state */ @@ -656,8 +653,8 @@ char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q) { return 0; } -char nfaExecSheng0_queueCompressState(UNUSED const struct NFA *nfa, - const struct mq *q, UNUSED s64a loc) { +char nfaExecSheng_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { void *dest = q->streamState; const void *src = q->state; assert(nfa->scratchStateSize == 1); @@ -666,9 +663,9 @@ char nfaExecSheng0_queueCompressState(UNUSED const struct NFA *nfa, return 0; } -char nfaExecSheng0_expandState(UNUSED const struct NFA *nfa, void *dest, - const void *src, UNUSED u64a offset, - UNUSED u8 key) { +char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { assert(nfa->scratchStateSize == 1); assert(nfa->streamStateSize == 1); *(u8 *)dest = *(const u8 *)src; diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 46ead180..84a2b6b5 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -35,27 +35,27 @@ struct mq; struct NFA; -#define nfaExecSheng0_B_Reverse NFA_API_NO_IMPL -#define nfaExecSheng0_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng_zombie_status NFA_API_ZOMBIE_NO_IMPL -char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end); -char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end); -char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report); -char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, struct mq *q); -char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q); -char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q); -char nfaExecSheng0_queueCompressState(const struct NFA *nfa, const struct mq *q, - s64a loc); -char nfaExecSheng0_expandState(const struct NFA *nfa, void *dest, - const void *src, u64a offset, u8 key); -char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, - void *state, u8 key); -char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, - const char *streamState, u64a offset, - NfaCallback callback, void *context); -char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecSheng_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); -char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, +char nfaExecSheng_B(const 
struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_internal.h b/src/nfa/sheng_internal.h index 046eb759..ff843ebe 100644 --- a/src/nfa/sheng_internal.h +++ b/src/nfa/sheng_internal.h @@ -30,7 +30,7 @@ #define SHENG_INTERNAL_H_ #include "ue2common.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #define SHENG_STATE_ACCEPT 0x10 #define SHENG_STATE_DEAD 0x20 diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index 911f6d70..53f2c131 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -48,7 +48,7 @@ #include "util/compile_context.h" #include "util/make_unique.h" #include "util/verify_types.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #include #include @@ -358,7 +358,7 @@ void populateBasicInfo(struct NFA *n, dfa_info &info, n->scratchStateSize = 1; n->streamStateSize = 1; n->nPositions = info.size(); - n->type = SHENG_NFA_0; + n->type = SHENG_NFA; n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0; sheng *s = (sheng *)getMutableImplNfa(n); @@ -442,14 +442,12 @@ void createShuffleMasks(sheng *s, dfa_info &info, #ifdef DEBUG dumpShuffleMask(chr, buf, sizeof(buf)); #endif - m128 mask = loadu128(buf); - s->shuffle_masks[chr] = mask; + memcpy(&s->shuffle_masks[chr], buf, sizeof(m128)); } } -bool has_accel_sheng(const NFA *nfa) { - const sheng *s = (const sheng *)getImplNfa(nfa); - return s->flags & SHENG_FLAG_HAS_ACCEL; +bool has_accel_sheng(const NFA *) { + return true; /* consider the sheng region as accelerated */ } aligned_unique_ptr<NFA> shengCompile(raw_dfa &raw, diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp index 037dfb05..ce87beaf 100644 --- a/src/nfa/shengdump.cpp +++ b/src/nfa/shengdump.cpp @@ -38,7 +38,8 @@ #include "ue2common.h" #include "util/charreach.h" #include "util/dump_charclass.h" -#include "util/simd_utils.h" +#include "util/dump_util.h" +#include "util/simd_types.h" #ifndef DUMP_SUPPORT @@ -100,7 +101,7 @@ void dumpMasks(FILE *f, const sheng *s) { for (u32 chr = 0; chr < 256; chr++) { u8 buf[16]; m128 shuffle_mask = s->shuffle_masks[chr]; - store128(buf, shuffle_mask); + memcpy(buf, &shuffle_mask, sizeof(m128)); fprintf(f, "%3u: ", chr); for (u32 pos = 0; pos < 16; pos++) { @@ -115,8 +116,9 @@ } } -void nfaExecSheng0_dumpText(const NFA *nfa, FILE *f) { - assert(nfa->type == SHENG_NFA_0); +static +void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA); const sheng *s = (const sheng *)getImplNfa(nfa); fprintf(f, "sheng DFA\n"); @@ -235,7 +237,7 @@ void shengGetTransitions(const NFA *n, u16 state, u16 *t) { u8 buf[16]; m128 shuffle_mask = s->shuffle_masks[i]; - store128(buf, shuffle_mask); + memcpy(buf, &shuffle_mask, sizeof(m128)); t[i] = buf[state] & SHENG_STATE_MASK; } @@ -243,8 +245,9 @@ void shengGetTransitions(const NFA *n, u16 state, u16 *t) { t[TOP] = aux->top & SHENG_STATE_MASK; } -void nfaExecSheng0_dumpDot(const NFA *nfa, FILE *f, const string &) { - assert(nfa->type == SHENG_NFA_0); +static +void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA); const sheng *s = (const sheng *)getImplNfa(nfa); dumpDotPreambleDfa(f); @@ -262,4 +265,14 @@ void nfaExecSheng0_dumpDot(const NFA *nfa, FILE *f, const string &) { fprintf(f, "}\n"); } +void nfaExecSheng_dump(const NFA *nfa, const string &base) { + assert(nfa->type == SHENG_NFA); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); +
nfaExecSheng_dumpText(nfa, f); + fclose(f); + f = fopen_or_throw((base + ".dot").c_str(), "w"); + nfaExecSheng_dumpDot(nfa, f); + fclose(f); +} + } // namespace ue2 diff --git a/src/nfa/shengdump.h b/src/nfa/shengdump.h index 5334894f..2bdffeb9 100644 --- a/src/nfa/shengdump.h +++ b/src/nfa/shengdump.h @@ -31,16 +31,13 @@ #ifdef DUMP_SUPPORT -#include <cstdio> #include <string> struct NFA; namespace ue2 { -void nfaExecSheng0_dumpDot(const struct NFA *nfa, FILE *file, - const std::string &base); -void nfaExecSheng0_dumpText(const struct NFA *nfa, FILE *file); +void nfaExecSheng_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 903e04da..d68b1b04 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -242,6 +242,7 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, #endif u32 z = movemask128(eq128(t2, ones)); + DEBUG_PRINTF(" z: 0x%08x\n", z); return firstMatch(buf, z); } @@ -302,6 +303,39 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } } +static really_inline +const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = vpshufb(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, const m256 low4bits, const m256 zeroes) { @@ -315,15 +349,21 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { assert(buf && buf_end); assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); // Slow path for small cases.
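For reference: the shuftiFwdShort/fwdBlockShort path added above covers 16-32 byte buffers with two overlapping unaligned 16-byte loads instead of a scalar loop; the overlap region is safe to scan twice, since a match in the first load returns before the second load happens. A minimal standalone sketch of the same idea, assuming SSE2 intrinsics and a plain single-byte search in place of the real shufti mask lookup (find_byte_short and the GCC/Clang __builtin_ctz are illustrative assumptions, not library code):

    #include <emmintrin.h>
    #include <cassert>

    // Find the first occurrence of byte c in [buf, buf_end), 16 <= len <= 32.
    static const unsigned char *find_byte_short(unsigned char c,
                                                const unsigned char *buf,
                                                const unsigned char *buf_end) {
        assert(buf_end - buf >= 16 && buf_end - buf <= 32);
        const __m128i needle = _mm_set1_epi8((char)c);

        // First 16 bytes.
        __m128i chars = _mm_loadu_si128((const __m128i *)buf);
        unsigned z = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(chars, needle));
        if (z) {
            return buf + __builtin_ctz(z); // lowest set bit = earliest match
        }

        // Last 16 bytes: overlaps the first load whenever len < 32, which is
        // harmless as any match in the overlap has already been reported.
        chars = _mm_loadu_si128((const __m128i *)(buf_end - 16));
        z = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(chars, needle));
        if (z) {
            return buf_end - 16 + __builtin_ctz(z);
        }
        return buf_end; // no match
    }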
- if (buf_end - buf < 32) { + if (buf_end - buf < 16) { return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); } - const m256 zeroes = zeroes256(); const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); const m256 wide_mask_lo = set2x128(mask_lo); const m256 wide_mask_hi = set2x128(mask_hi); const u8 *rv; @@ -365,12 +405,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } static really_inline -const u8 *lastMatch(const u8 *buf, m256 t, m256 compare) { -#ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk256(t); printf("\n"); -#endif - - u32 z = movemask256(eq256(t, compare)); +const u8 *lastMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffffffff)) { u32 pos = clz32(~z); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); @@ -395,9 +430,45 @@ const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif - return lastMatch(buf, t, zeroes); + u32 z = movemask256(eq256(t, zeroes)); + return lastMatch(buf, z); } +static really_inline +const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = vpshufb(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return lastMatch(buf, z); +} + +static really_inline +const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + + m128 chars = loadu128(buf_end - 16); + const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf); + rv = revBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + return buf - 1; +} + + /* takes 128 bit masks, but operates on 256 bits of data */ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { @@ -405,13 +476,18 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, assert(buf < buf_end); // Slow path for small cases. 
- if (buf_end - buf < 64) { + if (buf_end - buf < 16) { return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); } - const m256 zeroes = zeroes256(); const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); const m256 wide_mask_lo = set2x128(mask_lo); const m256 wide_mask_hi = set2x128(mask_hi); const u8 *rv; @@ -482,14 +558,56 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, return firstMatch(buf, z); } +static really_inline +const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf1 = vpshufb(mask1, c); + m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1); + m256 t0 = or256(c_shuf1, c_shuf2); + m128 t = or128(movdq_hi(t0), cast256to128(t0)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, ones128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + const m256 low4bits = set32x8(0xf); + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask1 = combine2x128(mask1_hi, mask1_lo); + const m256 mask2 = combine2x128(mask2_hi, mask2_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + /* takes 128 bit masks, but operates on 256 bits of data */ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + if (buf_end - buf < 32) { - // not worth it - return buf; + return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, + buf_end); } + const m256 ones = ones256(); const m256 low4bits = set32x8(0xf); const m256 wide_mask1_lo = set2x128(mask1_lo); diff --git a/src/nfa/shufticompile.cpp b/src/nfa/shufticompile.cpp index 217fcee0..12a94b7b 100644 --- a/src/nfa/shufticompile.cpp +++ b/src/nfa/shufticompile.cpp @@ -51,7 +51,7 @@ namespace ue2 { * * Note: always able to construct masks for 8 or fewer characters. */ -int shuftiBuildMasks(const CharReach &c, m128 *lo, m128 *hi) { +int shuftiBuildMasks(const CharReach &c, u8 *lo, u8 *hi) { /* Things could be packed much more optimally, but this should be able to * handle any set of characters entirely in the lower half. 
*/ @@ -134,7 +134,7 @@ void set_buckets_from_mask(u16 nibble_mask, u32 bucket, bool shuftiBuildDoubleMasks(const CharReach &onechar, const flat_set<pair<u8, u8>> &twochar, - m128 *lo1, m128 *hi1, m128 *lo2, m128 *hi2) { + u8 *lo1, u8 *hi1, u8 *lo2, u8 *hi2) { DEBUG_PRINTF("unibytes %zu dibytes %zu\n", onechar.size(), twochar.size()); array<u8, 16> lo1_a; @@ -210,9 +210,7 @@ bool shuftiBuildDoubleMasks(const CharReach &onechar, #ifdef DUMP_SUPPORT -CharReach shufti2cr(const m128 lo_in, const m128 hi_in) { - const u8 *lo = (const u8 *)&lo_in; - const u8 *hi = (const u8 *)&hi_in; +CharReach shufti2cr(const u8 *lo, const u8 *hi) { CharReach cr; for (u32 i = 0; i < 256; i++) { if (lo[(u8)i & 0xf] & hi[(u8)i >> 4]) { diff --git a/src/nfa/shufticompile.h b/src/nfa/shufticompile.h index 59126b0b..a72904e0 100644 --- a/src/nfa/shufticompile.h +++ b/src/nfa/shufticompile.h @@ -48,7 +48,7 @@ namespace ue2 { * * Note: always able to construct masks for 8 or fewer characters. */ -int shuftiBuildMasks(const CharReach &chars, m128 *lo, m128 *hi); +int shuftiBuildMasks(const CharReach &chars, u8 *lo, u8 *hi); /** \brief Double-byte variant * */ bool shuftiBuildDoubleMasks(const CharReach &onechar, const flat_set<pair<u8, u8>> &twochar, - m128 *lo1, m128 *hi1, m128 *lo2, m128 *hi2); + u8 *lo1, u8 *hi1, u8 *lo2, u8 *hi2); #ifdef DUMP_SUPPORT @@ -64,7 +64,7 @@ bool shuftiBuildDoubleMasks(const CharReach &onechar, * \brief Dump code: returns a CharReach with the reach that would match this * shufti. */ -CharReach shufti2cr(const m128 lo, const m128 hi); +CharReach shufti2cr(const u8 *lo, const u8 *hi); #endif // DUMP_SUPPORT diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c index b5f90e85..43480f06 100644 --- a/src/nfa/tamarama.c +++ b/src/nfa/tamarama.c @@ -265,9 +265,9 @@ void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { #endif } -char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, - const char *streamState, u64a offset, - NfaCallback callback, void *context) { +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -285,8 +285,7 @@ char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, return MO_CONTINUE_MATCHING; } -char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, - ReportID report) { +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report) { DEBUG_PRINTF("exec rose\n"); struct mq q1; q1.cur = q1.end = 0; @@ -304,7 +303,7 @@ char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, return rv; } -char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q) { +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -317,8 +316,8 @@ char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q) { return nfaReportCurrentMatches(sub, &q1); } -char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, - struct mq *q) { +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@
-331,7 +330,7 @@ char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, return nfaInAcceptState(sub, report, &q1); } -char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q) { +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -344,7 +343,7 @@ char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q) { return nfaInAnyAcceptState(sub, &q1); } -char nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q) { +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q) { DEBUG_PRINTF("init state\n"); const struct Tamarama *t = getImplNfa(n); char *ptr = q->streamState; @@ -354,8 +353,8 @@ char nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q) { return 0; } -char nfaExecTamarama0_queueCompressState(const struct NFA *n, - const struct mq *q, s64a loc) { +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -369,8 +368,8 @@ char nfaExecTamarama0_queueCompressState(const struct NFA *n, return nfaQueueCompressState(sub, &q1, loc); } -char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, - const void *src, u64a offset, u8 key) { +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(src, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -383,8 +382,8 @@ char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, return nfaExpandState(sub, dest, subStreamState, offset, key); } -enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, - struct mq *q, s64a loc) { +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -397,7 +396,7 @@ enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, return nfaGetZombieStatus(sub, &q1, loc); } -char nfaExecTamarama0_Q(const struct NFA *n, struct mq *q, s64a end) { +char nfaExecTamarama_Q(const struct NFA *n, struct mq *q, s64a end) { DEBUG_PRINTF("exec\n"); struct mq q1; char rv = MO_ALIVE; @@ -418,8 +417,7 @@ char nfaExecTamarama0_Q(const struct NFA *n, struct mq *q, s64a end) { return rv; } -char nfaExecTamarama0_Q2(const struct NFA *n, - struct mq *q, s64a end) { +char nfaExecTamarama_Q2(const struct NFA *n, struct mq *q, s64a end) { DEBUG_PRINTF("exec to match\n"); struct mq q1; char rv = 0; diff --git a/src/nfa/tamarama.h b/src/nfa/tamarama.h index 7ccfa5a0..3b52d8de 100644 --- a/src/nfa/tamarama.h +++ b/src/nfa/tamarama.h @@ -41,28 +41,27 @@ struct mq; struct NFA; struct hs_scratch; -char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, - const char *streamState, u64a offset, - NfaCallback callback, void *context); -char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, ReportID report); -char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q); -char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, - struct mq *q); -char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q); -char 
nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q); -char nfaExecTamarama0_queueCompressState(const struct NFA *n, - const struct mq *q, - s64a loc); -char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, - const void *src, u64a offset, u8 key); -enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, - struct mq *q, s64a loc); -char nfaExecTamarama0_Q(const struct NFA *nfa, struct mq *q, s64a end); -char nfaExecTamarama0_Q2(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc); +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key); +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc); +char nfaExecTamarama_Q(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama_Q2(const struct NFA *nfa, struct mq *q, s64a end); // only used by outfix and miracles, no implementation for tamarama -#define nfaExecTamarama0_initCompressedState NFA_API_NO_IMPL -#define nfaExecTamarama0_B_Reverse NFA_API_NO_IMPL +#define nfaExecTamarama_initCompressedState NFA_API_NO_IMPL +#define nfaExecTamarama_B_Reverse NFA_API_NO_IMPL #ifdef __cplusplus } diff --git a/src/nfa/tamarama_dump.cpp b/src/nfa/tamarama_dump.cpp index 181fa9af..88cb33cc 100644 --- a/src/nfa/tamarama_dump.cpp +++ b/src/nfa/tamarama_dump.cpp @@ -38,6 +38,7 @@ #include "nfa_dump_api.h" #include "nfa_dump_internal.h" #include "nfa_internal.h" +#include "util/dump_util.h" #include #include @@ -46,27 +47,14 @@ #error No dump support! 
#endif +using namespace std; + namespace ue2 { -void nfaExecTamarama0_dumpDot(const struct NFA *nfa, UNUSED FILE *f, - const std::string &base) { +void nfaExecTamarama_dump(const struct NFA *nfa, const string &base) { const Tamarama *t = (const Tamarama *)getImplNfa(nfa); - const u32 *subOffset = - (const u32 *)((const char *)t + sizeof(struct Tamarama) + - t->numSubEngines * sizeof(u32)); - for (u32 i = 0; i < t->numSubEngines; i++) { - std::stringstream ssdot; - ssdot << base << "rose_nfa_" << nfa->queueIndex - << "_sub_" << i << ".dot"; - const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); - FILE *f1 = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(sub, f1, base); - fclose(f1); - } -} -void nfaExecTamarama0_dumpText(const struct NFA *nfa, FILE *f) { - const Tamarama *t = (const Tamarama *)getImplNfa(nfa); + FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); fprintf(f, "Tamarama container engine\n"); fprintf(f, "\n"); @@ -75,15 +63,17 @@ void nfaExecTamarama0_dumpText(const struct NFA *nfa, FILE *f) { fprintf(f, "\n"); dumpTextReverse(nfa, f); fprintf(f, "\n"); + fclose(f); const u32 *subOffset = (const u32 *)((const char *)t + sizeof(struct Tamarama) + t->numSubEngines * sizeof(u32)); for (u32 i = 0; i < t->numSubEngines; i++) { - fprintf(f, "Sub %u:\n", i); const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); - nfaDumpText(sub, f); - fprintf(f, "\n"); + + stringstream sssub; + sssub << base << "_sub_" << i; + nfaGenerateDumpFiles(sub, sssub.str()); } } diff --git a/src/nfa/tamarama_dump.h b/src/nfa/tamarama_dump.h index dc976004..f40b7ecf 100644 --- a/src/nfa/tamarama_dump.h +++ b/src/nfa/tamarama_dump.h @@ -31,16 +31,13 @@ #if defined(DUMP_SUPPORT) -#include <cstdio> #include <string> struct NFA; namespace ue2 { -void nfaExecTamarama0_dumpDot(const NFA *nfa, FILE *file, - const std::string &base); -void nfaExecTamarama0_dumpText(const NFA *nfa, FILE *file); +void nfaExecTamarama_dump(const NFA *nfa, const std::string &base); } // namespace ue2 diff --git a/src/nfa/tamaramacompile.cpp b/src/nfa/tamaramacompile.cpp index 73d19595..c28caacb 100644 --- a/src/nfa/tamaramacompile.cpp +++ b/src/nfa/tamaramacompile.cpp @@ -117,7 +117,7 @@ aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo, const u32 queue, remapTops(tamaInfo, top_base, out_top_remap); size_t subSize = tamaInfo.subengines.size(); - DEBUG_PRINTF("subSize:%lu\n", subSize); + DEBUG_PRINTF("subSize:%zu\n", subSize); size_t total_size = sizeof(NFA) + // initial NFA structure sizeof(Tamarama) + // Tamarama structure @@ -134,7 +134,7 @@ aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo, const u32 queue, // so add one to subSize here u32 activeIdxSize = calcPackedBytes(subSize + 1); aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size); - nfa->type = verify_u8(TAMARAMA_NFA_0); + nfa->type = verify_u8(TAMARAMA_NFA); nfa->length = verify_u32(total_size); nfa->queueIndex = queue; diff --git a/src/nfa/trufflecompile.cpp b/src/nfa/trufflecompile.cpp index 6bde7abb..9442d046 100644 --- a/src/nfa/trufflecompile.cpp +++ b/src/nfa/trufflecompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,7 @@ #include "ue2common.h" #include "util/charreach.h" #include "util/simd_types.h" -#include "util/simd_utils.h" + #include "util/dump_mask.h" using namespace std; @@ -53,17 +53,15 @@
namespace ue2 { * bits 456 is the bit that is set at that offset. */ -void truffleBuildMasks(const CharReach &cr, m128 *shuf_mask_lo_highclear, - m128 *shuf_mask_lo_highset) { - *shuf_mask_lo_highset = zeroes128(); - *shuf_mask_lo_highclear = zeroes128(); - u8 *lo_highset = (u8 *)shuf_mask_lo_highset; - u8 *lo_highclear = (u8 *)shuf_mask_lo_highclear; +void truffleBuildMasks(const CharReach &cr, u8 *shuf_mask_lo_highclear, + u8 *shuf_mask_lo_highset) { + memset(shuf_mask_lo_highset, 0, sizeof(m128)); + memset(shuf_mask_lo_highclear, 0, sizeof(m128)); for (size_t v = cr.find_first(); v != CharReach::npos; v = cr.find_next(v)) { DEBUG_PRINTF("adding 0x%02x to %s\n", (u8)v, (v & 0x80) ? "highset" : "highclear"); - u8 *change_mask = (v & 0x80) ? lo_highset : lo_highclear; + u8 *change_mask = (v & 0x80) ? shuf_mask_lo_highset : shuf_mask_lo_highclear; u8 low_nibble = v & 0xf; u8 bits_456 = (v & 0x70) >> 4; change_mask[low_nibble] |= 1 << bits_456; @@ -73,18 +71,16 @@ void truffleBuildMasks(const CharReach &cr, m128 *shuf_mask_lo_highclear, /* * Reconstruct the charclass that the truffle masks represent */ -CharReach truffle2cr(const m128 highclear, const m128 highset) { - const u8 *lo = (const u8 *)&highclear; - const u8 *hi = (const u8 *)&highset; +CharReach truffle2cr(const u8 *highclear, const u8 *highset) { CharReach cr; for (u8 i = 0; i < 16; i++) { - u32 bits_456 = lo[i]; + u32 bits_456 = highclear[i]; while (bits_456) { u32 pos = findAndClearLSB_32(&bits_456); assert(pos < 8); cr.set(pos << 4 | i); } - bits_456 = hi[i]; + bits_456 = highset[i]; while (bits_456) { u32 pos = findAndClearLSB_32(&bits_456); assert(pos < 8); diff --git a/src/nfa/trufflecompile.h b/src/nfa/trufflecompile.h index 19d3eb54..14b314f3 100644 --- a/src/nfa/trufflecompile.h +++ b/src/nfa/trufflecompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,8 +34,8 @@ namespace ue2 { -void truffleBuildMasks(const CharReach &cr, m128 *mask1, m128 *mask2); -CharReach truffle2cr(const m128 lo_in, const m128 hi_in); +void truffleBuildMasks(const CharReach &cr, u8 *mask1, u8 *mask2); +CharReach truffle2cr(const u8 *lo_in, const u8 *hi_in); } diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index ba8afcf1..817e681a 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -74,9 +74,7 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, } buf += VERM_BOUNDARY - min; - if (buf >= buf_end) { - return buf_end; - } + assert(buf < buf_end); } // Aligned loops from here on in @@ -129,9 +127,7 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, } buf += VERM_BOUNDARY - min; - if (buf >= buf_end) { - return buf_end; - } + assert(buf < buf_end); } // Aligned loops from here on in @@ -172,9 +168,7 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, } buf += VERM_BOUNDARY - min; - if (buf >= buf_end) { - return buf_end - 1; - } + assert(buf < buf_end); } // Aligned loops from here on in @@ -190,9 +184,19 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, ptr = nocase ? dvermPreconditionNocase(chars1, chars2, buf_end - VERM_BOUNDARY) : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - /* buf_end - 1 to be conservative in case last byte is a partial match */ - return ptr ? 
ptr : buf_end - 1; + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; } static really_inline @@ -220,9 +224,7 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, } buf += VERM_BOUNDARY - min; - if (buf >= buf_end) { - return buf_end - 1; - } + assert(buf < buf_end); } // Aligned loops from here on in @@ -235,9 +237,17 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, // Tidy up the mess at the end ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf_end - VERM_BOUNDARY); - /* buf_end - 1 to be conservative in case last byte is a partial match */ - return ptr ? ptr : buf_end - 1; + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + return buf_end - 1; + } + + return buf_end; } // Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index deca3fd5..dff9c7e8 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -203,6 +203,7 @@ static bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, const u32 comp_id) { const CompileContext &cc = ng.cc; + assert(hasCorrectlyNumberedVertices(g)); DEBUG_PRINTF("expr=%u, comp=%u: %zu vertices, %zu edges\n", w.expressionIndex, comp_id, num_vertices(g), num_edges(g)); @@ -421,6 +422,7 @@ bool NG::addGraph(NGWrapper &w) { // Perform a reduction pass to merge sibling character classes together. if (cc.grey.performGraphSimplification) { removeRedundancy(w, som); + prunePathsRedundantWithSuccessorOfCyclics(w, som); } dumpDotWrapper(w, "04_reduced", cc.grey); diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp index ba352e60..ed9c7f48 100644 --- a/src/nfagraph/ng_anchored_dots.cpp +++ b/src/nfagraph/ng_anchored_dots.cpp @@ -202,7 +202,7 @@ void reformAnchoredRepeatsComponent(NGHolder &g, } if (!isStartNode(dotV, g.start, g, true)) { - DEBUG_PRINTF("fleeing: vertex %u has other preds\n", g[dotV].index); + DEBUG_PRINTF("fleeing: vertex %zu has other preds\n", g[dotV].index); return; } @@ -249,7 +249,7 @@ void reformAnchoredRepeatsComponent(NGHolder &g, remove_edge(g.start, v, g); } - DEBUG_PRINTF("removing vertex %u\n", g[dotV].index); + DEBUG_PRINTF("removing vertex %zu\n", g[dotV].index); clear_vertex(dotV, g); dead.insert(dotV); compAnchoredStarts.erase(dotV); @@ -313,14 +313,15 @@ void reformUnanchoredRepeatsComponent(NGHolder &g, } // A self-loop indicates that this is a '.+' or '.*' - DEBUG_PRINTF("self-loop detected on %u\n", g[dotV].index); + DEBUG_PRINTF("self-loop detected on %zu\n", g[dotV].index); *startEnd = depth::infinity(); remove_edge(dotV, dotV, g); return; } if (!isStartNode(dotV, g.startDs, g, true)) { - DEBUG_PRINTF("fleeing: vertex %u has other preds\n", g[dotV].index); + DEBUG_PRINTF("fleeing: vertex %zu has other preds\n", + g[dotV].index); return; } @@ -362,14 +363,14 @@ void reformUnanchoredRepeatsComponent(NGHolder &g, compUnanchoredStarts.clear(); for (auto t : adjacent_vertices_range(dotV, g)) { if (t != dotV) { - DEBUG_PRINTF("connecting sds -> %u\n", g[t].index); + DEBUG_PRINTF("connecting sds -> %zu\n", g[t].index); add_edge(g.startDs, t, g); add_edge(g.start, t, g); compUnanchoredStarts.insert(t); } } - DEBUG_PRINTF("removing vertex %u\n", g[dotV].index); + DEBUG_PRINTF("removing vertex %zu\n", 
g[dotV].index); dead.insert(dotV); clear_vertex(dotV, g); compUnanchoredStarts.erase(dotV); @@ -416,7 +417,7 @@ bool gatherParticipants(const NGHolder &g, if (isOptionalDot(t, v, g)) { // another dot; bail if we've seen it once already if (dots.find(t) != dots.end()) { - DEBUG_PRINTF("cycle detected at vertex %u\n", g[t].index); + DEBUG_PRINTF("cycle detected at vertex %zu\n", g[t].index); return false; } dots.insert(t); @@ -432,7 +433,7 @@ bool gatherParticipants(const NGHolder &g, for (auto w : adjacent_vertices_range(v, g)) { succ.insert(w); if (!edge(start, w, g).second) { - DEBUG_PRINTF("failing, vertex %u does not have edge from start\n", + DEBUG_PRINTF("failing, vertex %zu does not have edge from start\n", g[w].index); return false; } @@ -474,7 +475,7 @@ void collapseVariableDotRepeat(NGHolder &g, NFAVertex start, return; } initialDot = v; - DEBUG_PRINTF("initial dot vertex is %u\n", g[v].index); + DEBUG_PRINTF("initial dot vertex is %zu\n", g[v].index); } } @@ -507,12 +508,8 @@ void collapseVariableDotRepeat(NGHolder &g, NFAVertex start, } assert(startEnd->is_reachable()); - // For determinism, copy and sort our successor vertices. - deque<NFAVertex> s(succ.begin(), succ.end()); - sort(s.begin(), s.end(), make_index_ordering(g)); - // Connect our successor vertices to both start and startDs. - for (auto v : s) { + for (auto v : succ) { add_edge_if_not_present(g.start, v, g); add_edge_if_not_present(g.startDs, v, g); } @@ -637,8 +634,8 @@ void restoreLeadingDots(NGHolder &g, const depth &startBegin, } addDotsBetween(g, root, rhs, startBegin, startEnd); - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); } // Entry point. diff --git a/src/nfagraph/ng_asserts.cpp b/src/nfagraph/ng_asserts.cpp index e9e39345..c2f0d68f 100644 --- a/src/nfagraph/ng_asserts.cpp +++ b/src/nfagraph/ng_asserts.cpp @@ -101,7 +101,7 @@ vector<NFAEdge> getAsserts(const NGHolder &g) { static void addToSplit(const NGHolder &g, NFAVertex v, map<u32, NFAVertex> *to_split) { - DEBUG_PRINTF("%u needs splitting\n", g[v].index); + DEBUG_PRINTF("%zu needs splitting\n", g[v].index); to_split->emplace(g[v].index, v); } @@ -194,7 +194,7 @@ void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) { Report ir = rm.getBasicInternalReport(g, adj); g[v].reports.insert(rm.getInternalId(ir)); - DEBUG_PRINTF("set report id for vertex %u, adj %d\n", g[v].index, adj); + DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj); } static @@ -224,7 +224,7 @@ void splitVertex(ReportManager &rm, NGWrapper &g, NFAVertex v, bool ucp) { assert(v != g.start); assert(v != g.accept); assert(v != g.acceptEod); - DEBUG_PRINTF("partitioning vertex %u ucp:%d\n", g[v].index, (int)ucp); + DEBUG_PRINTF("partitioning vertex %zu ucp:%d\n", g[v].index, (int)ucp); CharReach cr_word = ucp ? CHARREACH_WORD_UCP_PRE : CHARREACH_WORD; CharReach cr_nonword = ucp ?
CHARREACH_NONWORD_UCP_PRE : CHARREACH_NONWORD; @@ -267,8 +267,8 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) { bool impassable = true; bool ucp = flags & UCP_ASSERT_FLAGS; - DEBUG_PRINTF("resolving edge %u->%u (flags=0x%x, ucp=%d)\n", g[u].index, - g[v].index, flags, (int)ucp); + DEBUG_PRINTF("resolving edge %zu->%zu (flags=0x%x, ucp=%d)\n", + g[u].index, g[v].index, flags, (int)ucp); while (flags && impassable) { u32 flag = 1U << findAndClearLSB_32(&flags); switch (flag) { @@ -377,17 +377,14 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) { add_edge(vv, g.accept, g); g[e].assert_flags = 0; add_edge(u, vv, g[e], g); - if (!edge(u, g.acceptEod, g).second) { - add_edge(u, g.acceptEod, g[e], g); - } else { - /* there may already be a different edge from start to eod - * if so we need to make it unconditional and alive - */ - NFAEdge start_eod = edge(u, g.acceptEod, g).first; - + /* there may already be a different edge from start to eod if so + * we need to make it unconditional and alive + */ + if (NFAEdge start_eod = edge(u, g.acceptEod, g)) { g[start_eod].assert_flags = 0; dead->erase(start_eod); - + } else { + add_edge(u, g.acceptEod, g[e], g); } dead->insert(e); } @@ -433,17 +430,14 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) { add_edge(vv, g.accept, g); g[e].assert_flags = 0; add_edge(u, vv, g[e], g); - if (!edge(u, g.acceptEod, g).second) { - add_edge(u, g.acceptEod, g[e], g); - } else { - /* there may already be a different edge from start to eod - * if so we need to make it unconditional and alive - */ - NFAEdge start_eod = edge(u, g.acceptEod, g).first; - + /* there may already be a different edge from start to eod if so + * we need to make it unconditional and alive + */ + if (NFAEdge start_eod = edge(u, g.acceptEod, g)) { g[start_eod].assert_flags = 0; dead->erase(start_eod); - + } else { + add_edge(u, g.acceptEod, g[e], g); } dead->insert(e); } @@ -482,12 +476,12 @@ void resolveAsserts(ReportManager &rm, NGWrapper &g) { resolveEdges(rm, g, &dead); remove_edges(dead, g); - g.renumberVertices(); + renumber_vertices(g); pruneUseless(g); pruneEmptyVertices(g); - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); clearReports(g); } @@ -496,10 +490,8 @@ void ensureCodePointStart(ReportManager &rm, NGWrapper &g) { * boundaries. Assert resolution handles the badness coming from asserts. * The only other source of trouble is startDs->accept connections.
*/ - bool exists; - NFAEdge orig; - tie(orig, exists) = edge(g.startDs, g.accept, g); - if (g.utf8 && exists) { + NFAEdge orig = edge(g.startDs, g.accept, g); + if (g.utf8 && orig) { DEBUG_PRINTF("rectifying %u\n", g.reportId); Report ir = rm.getBasicInternalReport(g); ReportID rep = rm.getInternalId(ir); @@ -552,7 +544,7 @@ void ensureCodePointStart(ReportManager &rm, NGWrapper &g) { add_edge(g.start, v_4, g); add_edge(g.startDs, v_4, g); remove_edge(orig, g); - g.renumberEdges(); + renumber_edges(g); clearReports(g); } } diff --git a/src/nfagraph/ng_builder.cpp b/src/nfagraph/ng_builder.cpp index 8a92b7ee..4ca0b37e 100644 --- a/src/nfagraph/ng_builder.cpp +++ b/src/nfagraph/ng_builder.cpp @@ -132,7 +132,7 @@ NFAVertex NFABuilderImpl::getVertex(Position pos) const { assert(id2vertex.size() >= pos); const NFAVertex v = id2vertex[pos]; assert(v != NGHolder::null_vertex()); - assert(graph->g[v].index == pos); + assert((*graph)[v].index == pos); return v; } @@ -147,7 +147,7 @@ void NFABuilderImpl::addVertex(Position pos) { id2vertex.resize(pos + 1); } id2vertex[pos] = v; - graph->g[v].index = pos; + (*graph)[v].index = pos; } unique_ptr NFABuilderImpl::getGraph() { @@ -177,26 +177,24 @@ void NFABuilderImpl::setNodeReportID(Position pos, int offsetAdjust) { void NFABuilderImpl::addCharReach(Position pos, const CharReach &cr) { NFAVertex v = getVertex(pos); - graph->g[v].char_reach |= cr; + (*graph)[v].char_reach |= cr; } void NFABuilderImpl::setAssertFlag(Position pos, u32 flag) { NFAVertex v = getVertex(pos); - graph->g[v].assert_flags |= flag; + (*graph)[v].assert_flags |= flag; } u32 NFABuilderImpl::getAssertFlag(Position pos) { NFAVertex v = getVertex(pos); - return graph->g[v].assert_flags; + return (*graph)[v].assert_flags; } pair<NFAEdge, bool> NFABuilderImpl::addEdge(NFAVertex u, NFAVertex v) { // assert that the edge doesn't already exist - assert(edge(u, v, graph->g).second == false); + assert(edge(u, v, *graph).second == false); - pair<NFAEdge, bool> e = add_edge(u, v, *graph); - assert(e.second); - return e; + return add_edge(u, v, *graph); } void NFABuilderImpl::addEdge(Position startPos, Position endPos) { @@ -209,16 +207,16 @@ void NFABuilderImpl::addEdge(Position startPos, Position endPos) { if ((u == graph->start || u == graph->startDs) && v == graph->startDs) { /* standard special -> special edges already exist */ - assert(edge(u, v, graph->g).second == true); + assert(edge(u, v, *graph).second == true); return; } - assert(edge(u, v, graph->g).second == false); + assert(edge(u, v, *graph).second == false); addEdge(u, v); } bool NFABuilderImpl::hasEdge(Position startPos, Position endPos) const { - return edge(getVertex(startPos), getVertex(endPos), graph->g).second; + return edge(getVertex(startPos), getVertex(endPos), *graph).second; } Position NFABuilderImpl::getStart() const { @@ -252,7 +250,7 @@ Position NFABuilderImpl::makePositions(size_t nPositions) { } void NFABuilderImpl::cloneRegion(Position first, Position last, unsigned posOffset) { - NFAGraph &g = graph->g; + NGHolder &g = *graph; assert(posOffset > 0); // walk the nodes between first and last and copy their vertex properties diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 658e7001..da6775e4 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -162,7 +162,7 @@ flat_set<NFAVertex> findHeadShell(const NGHolder &g, } for (UNUSED auto v : shell) { - DEBUG_PRINTF("shell: %u\n", g[v].index); + DEBUG_PRINTF("shell: %zu\n", g[v].index); } return shell; @@ -184,7 +184,7 @@
flat_set<NFAVertex> findTailShell(const NGHolder &g, } for (UNUSED auto v : shell) { - DEBUG_PRINTF("shell: %u\n", g[v].index); + DEBUG_PRINTF("shell: %zu\n", g[v].index); } return shell; @@ -209,7 +209,8 @@ vector<NFAEdge> findShellEdges(const NGHolder &g, if ((is_special(u, g) || contains(head_shell, u)) && (is_special(v, g) || contains(tail_shell, v))) { - DEBUG_PRINTF("edge (%u,%u) is a shell edge\n", g[u].index, g[v].index); + DEBUG_PRINTF("edge (%zu,%zu) is a shell edge\n", g[u].index, + g[v].index); shell_edges.push_back(e); } } @@ -275,9 +276,8 @@ void splitIntoComponents(const NGHolder &g, deque<unique_ptr<NGHolder>> &comps, NFAUndirectedGraph ug; ue2::unordered_map<NFAVertex, NFAUndirectedVertex> old2new; - ue2::unordered_map<u32, NFAVertex> newIdx2old; - createUnGraph(g.g, true, true, ug, old2new, newIdx2old); + createUnGraph(g, true, true, ug, old2new); // Construct reverse mapping. ue2::unordered_map<NFAUndirectedVertex, NFAVertex> new2old; @@ -313,7 +313,7 @@ void splitIntoComponents(const NGHolder &g, deque<unique_ptr<NGHolder>> &comps, assert(contains(new2old, uv)); NFAVertex v = new2old.at(uv); verts[c].push_back(v); - DEBUG_PRINTF("vertex %u is in comp %u\n", g[v].index, c); + DEBUG_PRINTF("vertex %zu is in comp %u\n", g[v].index, c); } ue2::unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder @@ -322,8 +322,9 @@ void splitIntoComponents(const NGHolder &g, deque<unique_ptr<NGHolder>> &comps, vv.insert(vv.end(), begin(head_shell), end(head_shell)); vv.insert(vv.end(), begin(tail_shell), end(tail_shell)); - // Sort by vertex index for determinism. - sort(begin(vv), end(vv), VertexIndexOrdering(g)); + /* Sort for determinism. Still required as NFAUndirectedVertex have + * no deterministic ordering (split_components map). */ + sort(begin(vv), end(vv)); auto gc = ue2::make_unique<NGHolder>(); v_map.clear(); @@ -349,9 +350,6 @@ void splitIntoComponents(const NGHolder &g, deque<unique_ptr<NGHolder>> &comps, vv.insert(vv.end(), begin(head_shell), end(head_shell)); vv.insert(vv.end(), begin(tail_shell), end(tail_shell)); - // Sort by vertex index for determinism.
- sort(begin(vv), end(vv), VertexIndexOrdering(g)); - auto gc = ue2::make_unique<NGHolder>(); v_map.clear(); fillHolder(gc.get(), g, vv, &v_map); diff --git a/src/nfagraph/ng_cyclic_redundancy.cpp b/src/nfagraph/ng_cyclic_redundancy.cpp index e2272264..9ae4458c 100644 --- a/src/nfagraph/ng_cyclic_redundancy.cpp +++ b/src/nfagraph/ng_cyclic_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -99,7 +99,7 @@ class SearchVisitor : public boost::default_dfs_visitor { template <class Vertex, class Graph> void discover_vertex(const Vertex &v, const Graph &g) const { - DEBUG_PRINTF("vertex %u\n", g[v].index); + DEBUG_PRINTF("vertex %zu\n", g[v].index); if (is_special(v, g)) { DEBUG_PRINTF("start or accept\n"); throw SearchFailed(); @@ -141,24 +141,16 @@ bool searchForward(const Graph &g, const CharReach &reach, } static -NFAEdge to_raw(const NFAEdge &e, const NFAGraph &, const NGHolder &) { +NFAEdge to_raw(const NFAEdge &e, const NGHolder &) { return e; } static -NFAEdge to_raw(const reverse_graph<NFAGraph, NFAGraph &>::edge_descriptor &e, - const reverse_graph<NFAGraph, NFAGraph &> &g, - const NGHolder &raw) { - /* clang doesn't seem to like edge_underlying */ - NFAVertex t = source(e, g); - NFAVertex s = target(e, g); - - assert(edge(s, t, raw).second); - - return edge(s, t, raw).first; +NFAEdge to_raw(const reverse_graph<NGHolder, NGHolder &>::edge_descriptor &e, + const reverse_graph<NGHolder, NGHolder &> &g) { + return get(boost::edge_underlying, g, e); } - /* returns true if we did stuff */ template <class Graph> static @@ -185,7 +177,7 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v, continue; } - DEBUG_PRINTF("- checking u %u\n", g[u].index); + DEBUG_PRINTF("- checking u %zu\n", g[u].index); // let s be intersection(succ(u), succ(v)) s.clear(); @@ -206,17 +198,18 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v, continue; } - DEBUG_PRINTF(" - checking w %u\n", g[w].index); + DEBUG_PRINTF(" - checking w %zu\n", g[w].index); - if (searchForward(g, reach, s, w)) { - DEBUG_PRINTF("removing edge (%u,%u)\n", - g[u].index, g[w].index); - /* we are currently iterating over the in-edges of v, so it - would be unwise to remove edges to v. However, */ - assert(w != v); /* as v is in s */ - remove_edge(to_raw(e_u, g, raw), raw); - did_stuff = true; + if (!searchForward(g, reach, s, w)) { + continue; } + + DEBUG_PRINTF("removing edge (%zu,%zu)\n", g[u].index, g[w].index); + /* we are currently iterating over the in-edges of v, so it + would be unwise to remove edges to v. However, */ + assert(w != v); /* as v is in s */ + remove_edge(to_raw(e_u, g), raw); + did_stuff = true; } } @@ -233,7 +226,7 @@ bool cyclicPathRedundancyPass(Graph &g, NGHolder &raw) { continue; } - DEBUG_PRINTF("examining cyclic vertex %u\n", g[v].index); + DEBUG_PRINTF("examining cyclic vertex %zu\n", g[v].index); did_stuff |= removeCyclicPathRedundancy(g, v, raw); } @@ -242,7 +235,7 @@ bool removeCyclicPathRedundancy(NGHolder &g) { // Forward pass. - bool f_changed = cyclicPathRedundancyPass(g.g, g); + bool f_changed = cyclicPathRedundancyPass(g, g); if (f_changed) { DEBUG_PRINTF("edges removed by forward pass\n"); pruneUseless(g); } // Reverse pass.
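For context: the forward/reverse structure above works because boost::reverse_graph is a zero-copy adaptor that presents the transpose of a bidirectional graph, so the same templated pass can run in both directions without materialising a second graph. A toy sketch of the adaptor, using a hypothetical three-vertex graph rather than the NGHolder types in this patch:

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/reverse_graph.hpp>
    #include <iostream>

    int main() {
        using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                            boost::bidirectionalS>;
        Graph g(3);
        boost::add_edge(0, 1, g);
        boost::add_edge(1, 2, g);

        // Zero-copy view: edges appear flipped, nothing is rebuilt.
        boost::reverse_graph<Graph, const Graph &> rg(g);
        std::cout << boost::out_degree(2, rg) << "\n"; // prints 1 (in-degree in g)
        return 0;
    }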
DEBUG_PRINTF("REVERSE PASS\n"); - typedef reverse_graph RevGraph; - RevGraph revg(g.g); + typedef reverse_graph RevGraph; + RevGraph revg(g); bool r_changed = cyclicPathRedundancyPass(revg, g); if (r_changed) { DEBUG_PRINTF("edges removed by reverse pass\n"); diff --git a/src/nfagraph/ng_depth.cpp b/src/nfagraph/ng_depth.cpp index d7945be9..63e0e46b 100644 --- a/src/nfagraph/ng_depth.cpp +++ b/src/nfagraph/ng_depth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,11 +44,14 @@ #include #include #include +#include using namespace std; using boost::filtered_graph; +using boost::make_filtered_graph; using boost::make_constant_property; using boost::reverse_graph; +using boost::adaptors::reverse; namespace ue2 { @@ -122,25 +125,23 @@ private: template static -void findLoopReachable(const GraphT &g, const NFAVertex srcVertex, +void findLoopReachable(const GraphT &g, + const typename GraphT::vertex_descriptor srcVertex, vector &deadNodes) { typedef typename GraphT::edge_descriptor EdgeT; + typedef typename GraphT::vertex_descriptor VertexT; typedef set EdgeSet; EdgeSet deadEdges; BackEdges be(deadEdges); - auto index_map = get(&NFAGraphVertexProps::index, g); + depth_first_search(g, visitor(be).root_vertex(srcVertex)); + auto af = make_bad_edge_filter(&deadEdges); + auto acyclic_g = make_filtered_graph(g, af); - depth_first_search(g, visitor(be).root_vertex(srcVertex).vertex_index_map( - index_map)); - AcyclicFilter af(&deadEdges); - filtered_graph > acyclic_g(g, af); - - vector topoOrder; /* actually reverse topological order */ + vector topoOrder; /* actually reverse topological order */ topoOrder.reserve(deadNodes.size()); - topological_sort(acyclic_g, back_inserter(topoOrder), - vertex_index_map(index_map)); + topological_sort(acyclic_g, back_inserter(topoOrder)); for (const auto &e : deadEdges) { u32 srcIdx = g[source(e, g)].index; @@ -149,8 +150,7 @@ void findLoopReachable(const GraphT &g, const NFAVertex srcVertex, } } - for (auto it = topoOrder.rbegin(); it != topoOrder.rend(); ++it) { - NFAVertex v = *it; + for (VertexT v : reverse(topoOrder)) { for (const auto &e : in_edges_range(v, g)) { if (deadNodes[g[source(e, g)].index]) { deadNodes[g[v].index] = true; @@ -162,13 +162,13 @@ void findLoopReachable(const GraphT &g, const NFAVertex srcVertex, template static -void calcDepthFromSource(const NGHolder &graph, const GraphT &g, +void calcDepthFromSource(const GraphT &g, typename GraphT::vertex_descriptor srcVertex, - const vector &deadNodes, - vector &dMin, vector &dMax) { + const vector &deadNodes, vector &dMin, + vector &dMax) { typedef typename GraphT::edge_descriptor EdgeT; - const size_t numVerts = num_vertices(graph); + const size_t numVerts = num_vertices(g); NodeFilter nf(&deadNodes, &g); StartFilter sf(&g); @@ -194,22 +194,20 @@ void calcDepthFromSource(const NGHolder &graph, const GraphT &g, using boost::make_iterator_property_map; - auto min_index_map = get(&NFAGraphVertexProps::index, mindist_g); + auto min_index_map = get(vertex_index, mindist_g); breadth_first_search(mindist_g, srcVertex, - boost::vertex_index_map(min_index_map). 
visitor(make_bfs_visitor(record_distances( - make_iterator_property_map( - dMin.begin(), min_index_map), + make_iterator_property_map(dMin.begin(), + min_index_map), boost::on_tree_edge())))); - auto max_index_map = get(&NFAGraphVertexProps::index, maxdist_g); + auto max_index_map = get(vertex_index, maxdist_g); dag_shortest_paths(maxdist_g, srcVertex, - boost::vertex_index_map(max_index_map). - distance_map(make_iterator_property_map(dMax.begin(), - max_index_map)). - weight_map(make_constant_property(-1))); + distance_map(make_iterator_property_map(dMax.begin(), + max_index_map)) + .weight_map(make_constant_property(-1))); for (size_t i = 0; i < numVerts; i++) { if (dMin[i] > DIST_UNREACHABLE) { @@ -254,14 +252,14 @@ DepthMinMax getDepths(u32 idx, const vector<u32> &dMin, template <class Graph, class Output> static -void calcAndStoreDepth(const NGHolder &h, const Graph &g, +void calcAndStoreDepth(const Graph &g, const typename Graph::vertex_descriptor src, const vector<bool> &deadNodes, vector<u32> &dMin /* util */, vector<u32> &dMax /* util */, vector<Output> &depths, DepthMinMax Output::*store) { - calcDepthFromSource(h, g, src, deadNodes, dMin, dMax); + calcDepthFromSource(g, src, deadNodes, dMin, dMax); for (auto v : vertices_range(g)) { u32 idx = g[v].index; @@ -285,14 +283,14 @@ void calcDepths(const NGHolder &g, std::vector<NFAVertexDepth> &depths) { * reachable from a loop need to be removed */ vector<bool> deadNodes(numVertices); - findLoopReachable(g.g, g.start, deadNodes); + findLoopReachable(g, g.start, deadNodes); DEBUG_PRINTF("doing start\n"); - calcAndStoreDepth(g, g.g, g.start, deadNodes, dMin, dMax, - depths, &NFAVertexDepth::fromStart); + calcAndStoreDepth(g, g.start, deadNodes, dMin, dMax, depths, + &NFAVertexDepth::fromStart); DEBUG_PRINTF("doing startds\n"); - calcAndStoreDepth(g, g.g, g.startDs, deadNodes, dMin, dMax, - depths, &NFAVertexDepth::fromStartDotStar); + calcAndStoreDepth(g, g.startDs, deadNodes, dMin, dMax, depths, + &NFAVertexDepth::fromStartDotStar); } void calcDepths(const NGHolder &g, std::vector<NFAVertexRevDepth> &depths) { @@ -305,8 +303,10 @@ void calcDepths(const NGHolder &g, std::vector<NFAVertexRevDepth> &depths) { vector<u32> dMax; /* reverse the graph before walking it */ - typedef reverse_graph<NFAGraph, const NFAGraph &> RevNFAGraph; - const RevNFAGraph rg(g.g); + typedef reverse_graph<NGHolder, const NGHolder &> RevNFAGraph; + const RevNFAGraph rg(g); + + assert(num_vertices(g) == num_vertices(rg)); /* * create a filtered graph for max depth calculations: all nodes/edges @@ -317,12 +317,12 @@ void calcDepths(const NGHolder &g, std::vector<NFAVertexRevDepth> &depths) { DEBUG_PRINTF("doing accept\n"); calcAndStoreDepth<RevNFAGraph, NFAVertexRevDepth>( - g, rg, g.accept, deadNodes, dMin, dMax, depths, + rg, g.accept, deadNodes, dMin, dMax, depths, &NFAVertexRevDepth::toAccept); DEBUG_PRINTF("doing accepteod\n"); deadNodes[NODE_ACCEPT] = true; // Hide accept->acceptEod edge.
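Background on the dMax calculation in calcDepthFromSource above: the longest path in a DAG is the negated shortest path when every edge carries weight -1, which is exactly what the make_constant_property(-1) weight map feeds into dag_shortest_paths. A self-contained sketch of that trick on a hypothetical four-vertex DAG (toy example, not the NFA graph types used here):

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/dag_shortest_paths.hpp>
    #include <boost/graph/property_maps/constant_property_map.hpp>
    #include <boost/property_map/property_map.hpp>
    #include <iostream>
    #include <vector>

    int main() {
        using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                            boost::directedS>;
        using Edge = boost::graph_traits<Graph>::edge_descriptor;
        Graph g(4);
        boost::add_edge(0, 1, g); // long route: 0 -> 1 -> 2 -> 3
        boost::add_edge(1, 2, g);
        boost::add_edge(2, 3, g);
        boost::add_edge(0, 3, g); // short route: 0 -> 3

        std::vector<int> dist(num_vertices(g));
        auto dist_map = boost::make_iterator_property_map(
            dist.begin(), boost::get(boost::vertex_index, g));
        // Every edge weighs -1, so the "shortest" path is the longest one.
        boost::dag_shortest_paths(g, 0,
                                  boost::distance_map(dist_map).weight_map(
                                      boost::make_constant_property<Edge>(-1)));

        std::cout << -dist[3] << "\n"; // max depth of vertex 3: prints 3
        return 0;
    }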
calcAndStoreDepth( - g, rg, g.acceptEod, deadNodes, dMin, dMax, depths, + rg, g.acceptEod, deadNodes, dMin, dMax, depths, &NFAVertexRevDepth::toAcceptEod); } @@ -340,31 +340,31 @@ void calcDepths(const NGHolder &g, vector &depths) { * reachable from a loop need to be removed */ vector deadNodes(numVertices); - findLoopReachable(g.g, g.start, deadNodes); + findLoopReachable(g, g.start, deadNodes); DEBUG_PRINTF("doing start\n"); - calcAndStoreDepth( - g, g.g, g.start, deadNodes, dMin, dMax, depths, + calcAndStoreDepth( + g, g.start, deadNodes, dMin, dMax, depths, &NFAVertexBidiDepth::fromStart); DEBUG_PRINTF("doing startds\n"); - calcAndStoreDepth( - g, g.g, g.startDs, deadNodes, dMin, dMax, depths, + calcAndStoreDepth( + g, g.startDs, deadNodes, dMin, dMax, depths, &NFAVertexBidiDepth::fromStartDotStar); /* Now go backwards */ - typedef reverse_graph RevNFAGraph; - const RevNFAGraph rg(g.g); + typedef reverse_graph RevNFAGraph; + const RevNFAGraph rg(g); deadNodes.assign(numVertices, false); findLoopReachable(rg, g.acceptEod, deadNodes); DEBUG_PRINTF("doing accept\n"); calcAndStoreDepth( - g, rg, g.accept, deadNodes, dMin, dMax, depths, + rg, g.accept, deadNodes, dMin, dMax, depths, &NFAVertexBidiDepth::toAccept); DEBUG_PRINTF("doing accepteod\n"); deadNodes[NODE_ACCEPT] = true; // Hide accept->acceptEod edge. calcAndStoreDepth( - g, rg, g.acceptEod, deadNodes, dMin, dMax, depths, + rg, g.acceptEod, deadNodes, dMin, dMax, depths, &NFAVertexBidiDepth::toAcceptEod); } @@ -374,10 +374,10 @@ void calcDepthsFrom(const NGHolder &g, const NFAVertex src, const size_t numVertices = num_vertices(g); vector deadNodes(numVertices); - findLoopReachable(g.g, g.start, deadNodes); + findLoopReachable(g, g.start, deadNodes); vector dMin, dMax; - calcDepthFromSource(g, g.g, src, deadNodes, dMin, dMax); + calcDepthFromSource(g, src, deadNodes, dMin, dMax); depths.clear(); depths.resize(numVertices); diff --git a/src/nfagraph/ng_dominators.cpp b/src/nfagraph/ng_dominators.cpp index 05650aaf..d01af994 100644 --- a/src/nfagraph/ng_dominators.cpp +++ b/src/nfagraph/ng_dominators.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-16, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,37 +48,45 @@ using boost::make_iterator_property_map; namespace ue2 { template -ue2::unordered_map calcDominators(const Graph &g, - NFAVertex source) { +unordered_map calcDominators(const Graph &g, + typename Graph::vertex_descriptor source) { + using Vertex = typename Graph::vertex_descriptor; const size_t num_verts = num_vertices(g); auto index_map = get(&NFAGraphVertexProps::index, g); vector dfnum(num_verts, 0); - vector parents(num_verts, Graph::null_vertex()); + vector parents(num_verts, Graph::null_vertex()); auto dfnum_map = make_iterator_property_map(dfnum.begin(), index_map); auto parent_map = make_iterator_property_map(parents.begin(), index_map); - vector vertices_by_dfnum(num_verts, Graph::null_vertex()); + vector vertices_by_dfnum(num_verts, Graph::null_vertex()); // Output map. 
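calcDominators here feeds Hyperscan's patched copy of Lengauer-Tarjan (the boost_ue2 namespace); stock BGL ships the same algorithm, and findPostDominators obtains post-dominators by running it over a boost::reverse_graph from acceptEod. A minimal sketch using the stock call on a plain graph:

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/dominator_tree.hpp>
    #include <boost/property_map/property_map.hpp>
    #include <iostream>
    #include <vector>

    int main() {
        using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                            boost::bidirectionalS>;
        using Vertex = Graph::vertex_descriptor;

        Graph g(4); // diamond: vertex 3 is dominated only by vertex 0
        add_edge(0, 1, g);
        add_edge(0, 2, g);
        add_edge(1, 3, g);
        add_edge(2, 3, g);

        // idom[v] is v's immediate dominator after the call.
        std::vector<Vertex> idom(num_vertices(g),
                                 boost::graph_traits<Graph>::null_vertex());
        auto dom_map = boost::make_iterator_property_map(
            idom.begin(), get(boost::vertex_index, g));

        boost::lengauer_tarjan_dominator_tree(g, vertex(0, g), dom_map);

        for (size_t v = 1; v < num_vertices(g); v++) {
            std::cout << "idom(" << v << ") = " << idom[v] << "\n";
        }
        return 0;
    }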
- unordered_map doms; + unordered_map doms; auto dom_map = make_assoc_property_map(doms); boost_ue2::lengauer_tarjan_dominator_tree(g, source, index_map, dfnum_map, parent_map, vertices_by_dfnum, dom_map); - return doms; + /* Translate back to an NFAVertex map */ + unordered_map doms2; + for (const auto &e : doms) { + NFAVertex f(e.first); + NFAVertex s(e.second); + doms2[f] = s; + } + return doms2; } -ue2::unordered_map findDominators(const NGHolder &g) { +unordered_map findDominators(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); - return calcDominators(g.g, g.start); + return calcDominators(g, g.start); } -ue2::unordered_map findPostDominators(const NGHolder &g) { +unordered_map findPostDominators(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); - return calcDominators(boost::reverse_graph(g.g), + return calcDominators(boost::reverse_graph(g), g.acceptEod); } diff --git a/src/nfagraph/ng_dump.cpp b/src/nfagraph/ng_dump.cpp index 57668caf..fc840f25 100644 --- a/src/nfagraph/ng_dump.cpp +++ b/src/nfagraph/ng_dump.cpp @@ -234,9 +234,9 @@ public: void operator()(ostream& os, const EdgeT& e) const { // Edge label. Print priority. os << "[fontsize=9,label=\""; - // If it's an edge from start, print top id. - if (is_any_start(source(e, g), g) && !is_any_start(target(e, g), g)) { - os << "TOP " << g[e].top << "\\n"; + // print tops if any set. + if (!g[e].tops.empty()) { + os << "TOP " << as_string_list(g[e].tops) << "\\n"; } // If it's an assert vertex, then display its info. @@ -285,7 +285,7 @@ void dumpGraphImpl(const char *name, const GraphT &g, } // manual instantiation of templated dumpGraph above. -template void dumpGraphImpl(const char *, const NFAGraph &); +template void dumpGraphImpl(const char *, const NGHolder &); void dumpDotWrapperImpl(const NGWrapper &nw, const char *name, const Grey &grey) { @@ -293,7 +293,7 @@ void dumpDotWrapperImpl(const NGWrapper &nw, const char *name, stringstream ss; ss << grey.dumpPath << "Expr_" << nw.expressionIndex << "_" << name << ".dot"; DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str()); - dumpGraphImpl(ss.str().c_str(), nw.g); + dumpGraphImpl(ss.str().c_str(), nw); } } @@ -304,7 +304,7 @@ void dumpComponentImpl(const NGHolder &g, const char *name, u32 expr, ss << grey.dumpPath << "Comp_" << expr << "-" << comp << "_" << name << ".dot"; DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str()); - dumpGraphImpl(ss.str().c_str(), g.g); + dumpGraphImpl(ss.str().c_str(), g); } } @@ -315,7 +315,7 @@ void dumpSomSubComponentImpl(const NGHolder &g, const char *name, u32 expr, ss << grey.dumpPath << "Comp_" << expr << "-" << comp << "_" << name << "_" << plan << ".dot"; DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str()); - dumpGraphImpl(ss.str().c_str(), g.g); + dumpGraphImpl(ss.str().c_str(), g); } } @@ -325,7 +325,7 @@ void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber, stringstream ss; ss << grey.dumpPath << "Holder_X_" << stageNumber << "-" << stageName << ".dot"; - dumpGraphImpl(ss.str().c_str(), h.g); + dumpGraphImpl(ss.str().c_str(), h); } } @@ -337,7 +337,7 @@ void dumpHolderImpl(const NGHolder &h, stringstream ss; ss << grey.dumpPath << "Holder_X_" << stageNumber << "-" << stageName << ".dot"; - dumpGraphImpl(ss.str().c_str(), h.g, region_map); + dumpGraphImpl(ss.str().c_str(), h, region_map); } } diff --git a/src/nfagraph/ng_edge_redundancy.cpp b/src/nfagraph/ng_edge_redundancy.cpp index 5944cfef..3ce62c41 100644 --- a/src/nfagraph/ng_edge_redundancy.cpp +++ 
b/src/nfagraph/ng_edge_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -297,9 +297,8 @@ bool checkFwdCandidate(const NGHolder &g, NFAVertex fixed_src, return false; } - DEBUG_PRINTF("edge (%u, %u) killed by edge (%u, %u)\n", - g[w].index, g[v].index, - g[fixed_src].index, g[v].index); + DEBUG_PRINTF("edge (%zu, %zu) killed by edge (%zu, %zu)\n", + g[w].index, g[v].index, g[fixed_src].index, g[v].index); return true; } @@ -415,7 +414,7 @@ bool removeEdgeRedundancyFwd(NGHolder &g, bool ignore_starts) { pred(g, u, &parents_u); done.clear(); - if (hasGreaterOutDegree(1, u, g)) { + if (out_degree(u, g) > 1) { checkLargeOutU(g, u, parents_u, possible_w, done, &dead); } else { checkSmallOutU(g, u, parents_u, done, &dead); @@ -460,7 +459,7 @@ bool removeSiblingsOfStartDotStar(NGHolder &g) { vector dead; for (auto v : adjacent_vertices_range(g.startDs, g)) { - DEBUG_PRINTF("checking %u\n", g[v].index); + DEBUG_PRINTF("checking %zu\n", g[v].index); if (is_special(v, g)) { continue; } @@ -470,8 +469,7 @@ bool removeSiblingsOfStartDotStar(NGHolder &g) { if (is_special(u, g)) { continue; } - DEBUG_PRINTF("removing %u->%u\n", g[u].index, - g[v].index); + DEBUG_PRINTF("removing %zu->%zu\n", g[u].index, g[v].index); dead.push_back(e); } } diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index d0ab7c4a..32a392a6 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -38,17 +38,16 @@ #include "ng_util.h" #include "util/compile_context.h" #include "util/graph_range.h" +#include "util/make_unique.h" #include "util/ue2_containers.h" #include +#include #include #include #include -#include - using namespace std; -using boost::ptr_vector; namespace ue2 { @@ -72,17 +71,17 @@ struct VertexInfoPtrCmp { class VertexInfo { public: VertexInfo(NFAVertex v_in, const NGHolder &g) - : v(v_in), vert_index(g[v].index), cr(g[v].char_reach), edge_top(~0), + : v(v_in), vert_index(g[v].index), cr(g[v].char_reach), equivalence_class(~0), vertex_flags(g[v].assert_flags) {} flat_set pred; //!< predecessors of this vertex flat_set succ; //!< successors of this vertex NFAVertex v; - u32 vert_index; + size_t vert_index; CharReach cr; CharReach pred_cr; CharReach succ_cr; - unsigned edge_top; + flat_set edge_tops; /**< tops on edge from start */ unsigned equivalence_class; unsigned vertex_flags; }; @@ -120,15 +119,15 @@ public: EquivalenceType eq) : /* reports only matter for right-equiv */ rs(eq == RIGHT_EQUIVALENCE ? g[vi.v].reports : flat_set()), - vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr), + vertex_flags(vi.vertex_flags), edge_tops(vi.edge_tops), cr(vi.cr), adjacent_cr(eq == LEFT_EQUIVALENCE ? 
vi.pred_cr : vi.succ_cr), /* treat non-special vertices the same */ - node_type(min(g[vi.v].index, u32{N_SPECIALS})), depth(d_in) {} + node_type(min(g[vi.v].index, size_t{N_SPECIALS})), depth(d_in) {} bool operator==(const ClassInfo &b) const { return node_type == b.node_type && depth.d1 == b.depth.d1 && depth.d2 == b.depth.d2 && cr == b.cr && - adjacent_cr == b.adjacent_cr && edge_top == b.edge_top && + adjacent_cr == b.adjacent_cr && edge_tops == b.edge_tops && vertex_flags == b.vertex_flags && rs == b.rs; } @@ -136,7 +135,6 @@ public: size_t val = 0; boost::hash_combine(val, boost::hash_range(begin(c.rs), end(c.rs))); boost::hash_combine(val, c.vertex_flags); - boost::hash_combine(val, c.edge_top); boost::hash_combine(val, c.cr); boost::hash_combine(val, c.adjacent_cr); boost::hash_combine(val, c.node_type); @@ -148,7 +146,7 @@ public: private: flat_set rs; /* for right equiv only */ unsigned vertex_flags; - u32 edge_top; + flat_set edge_tops; CharReach cr; CharReach adjacent_cr; unsigned node_type; @@ -277,47 +275,47 @@ bool hasEdgeAsserts(NFAVertex v, const NGHolder &g) { // populate VertexInfo table static -ptr_vector getVertexInfos(const NGHolder &g) { +vector> getVertexInfos(const NGHolder &g) { const size_t num_verts = num_vertices(g); - ptr_vector infos; + vector> infos; infos.reserve(num_verts * 2); vector vertex_map; // indexed by vertex_index property vertex_map.resize(num_verts); for (auto v : vertices_range(g)) { - VertexInfo *vi = new VertexInfo(v, g); - - // insert our new shiny VertexInfo into the info map - infos.push_back(vi); - - vertex_map[g[v].index] = vi; + infos.push_back(make_unique(v, g)); + vertex_map[g[v].index] = infos.back().get(); } - // now, go through each vertex and populate its predecessor and successor lists - for (VertexInfo &cur_vi : infos) { - // find predecessors - for (const auto &e : in_edges_range(cur_vi.v, g)) { - NFAVertex u = source(e, g); - VertexInfo *vmi = vertex_map[g[u].index]; + // now, go through each vertex and populate its predecessor and successor + // lists + for (auto &vi : infos) { + assert(vi); + NFAVertex v = vi->v; - cur_vi.pred_cr |= vmi->cr; - cur_vi.pred.insert(vmi); + // find predecessors + for (const auto &e : in_edges_range(v, g)) { + NFAVertex u = source(e, g); + VertexInfo *u_vi = vertex_map[g[u].index]; + + vi->pred_cr |= u_vi->cr; + vi->pred.insert(u_vi); // also set up edge tops if (is_triggered(g) && u == g.start) { - cur_vi.edge_top = g[e].top; + vi->edge_tops = g[e].tops; } } // find successors - for (auto w : adjacent_vertices_range(cur_vi.v, g)) { - VertexInfo *vmi = vertex_map[g[w].index]; - cur_vi.succ_cr |= vmi->cr; - cur_vi.succ.insert(vmi); + for (auto w : adjacent_vertices_range(v, g)) { + VertexInfo *w_vi = vertex_map[g[w].index]; + vi->succ_cr |= w_vi->cr; + vi->succ.insert(w_vi); } - assert(!hasEdgeAsserts(cur_vi.v, g)); + assert(!hasEdgeAsserts(vi->v, g)); } return infos; @@ -325,7 +323,7 @@ ptr_vector getVertexInfos(const NGHolder &g) { // store equivalence class in VertexInfo for each vertex static -vector partitionGraph(ptr_vector &infos, +vector partitionGraph(vector> &infos, WorkQueue &work_queue, const NGHolder &g, EquivalenceType eq) { const size_t num_verts = infos.size(); @@ -350,28 +348,30 @@ vector partitionGraph(ptr_vector &infos, } // partition the graph based on CharReach - for (VertexInfo &vi : infos) { + for (auto &vi : infos) { + assert(vi); + ClassInfo::ClassDepth depth; if (eq == LEFT_EQUIVALENCE) { - depth = depths[vi.vert_index]; + depth = depths[vi->vert_index]; } else { - depth = 
rdepths[vi.vert_index]; + depth = rdepths[vi->vert_index]; } - ClassInfo ci(g, vi, depth, eq); + ClassInfo ci(g, *vi, depth, eq); auto ii = classinfomap.find(ci); if (ii == classinfomap.end()) { // vertex is in a new equivalence class by itself. unsigned eq_class = classes.size(); - vi.equivalence_class = eq_class; - classes.push_back({&vi}); + vi->equivalence_class = eq_class; + classes.push_back({vi.get()}); classinfomap.emplace(move(ci), eq_class); } else { // vertex is added to an existing class. unsigned eq_class = ii->second; - vi.equivalence_class = eq_class; - classes.at(eq_class).insert(&vi); + vi->equivalence_class = eq_class; + classes.at(eq_class).insert(vi.get()); // we now know that this particular class has more than one // vertex, so we add it to the work queue @@ -501,8 +501,9 @@ bool require_separate_eod_vertex(const VertexInfoSet &vert_infos, } static -void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, - VertexInfoSet &cur_class_vertices, set *toRemove) { +void mergeClass(vector> &infos, NGHolder &g, + unsigned eq_class, VertexInfoSet &cur_class_vertices, + set *toRemove) { DEBUG_PRINTF("Replacing %zd vertices from equivalence class %u with a " "single vertex.\n", cur_class_vertices.size(), eq_class); @@ -530,9 +531,9 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, * props */ g[new_v].reports.clear(); /* populated as we pull in succs */ - VertexInfo *new_vertex_info = new VertexInfo(new_v, g); // store this vertex in our global vertex list - infos.push_back(new_vertex_info); + infos.push_back(make_unique(new_v, g)); + VertexInfo *new_vertex_info = infos.back().get(); NFAVertex new_v_eod = NGHolder::null_vertex(); VertexInfo *new_vertex_info_eod = nullptr; @@ -540,11 +541,11 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, if (require_separate_eod_vertex(cur_class_vertices, g)) { new_v_eod = clone_vertex(g, old_v); g[new_v_eod].reports.clear(); - new_vertex_info_eod = new VertexInfo(new_v_eod, g); - infos.push_back(new_vertex_info_eod); + infos.push_back(make_unique(new_v_eod, g)); + new_vertex_info_eod = infos.back().get(); } - const unsigned edgetop = (*cur_class_vertices.begin())->edge_top; + const auto &edgetops = (*cur_class_vertices.begin())->edge_tops; for (VertexInfo *old_vertex_info : cur_class_vertices) { assert(old_vertex_info->equivalence_class == eq_class); @@ -563,22 +564,24 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, pred_info->succ.erase(old_vertex_info); // if edge doesn't exist, create it - NFAEdge e = add_edge_if_not_present(pred_info->v, new_v, g).first; + NFAEdge e = add_edge_if_not_present(pred_info->v, new_v, g); - // put edge top, if applicable - if (edgetop != (unsigned) -1) { - g[e].top = edgetop; + // put edge tops, if applicable + if (!edgetops.empty()) { + assert(g[e].tops.empty() || g[e].tops == edgetops); + g[e].tops = edgetops; } pred_info->succ.insert(new_vertex_info); if (new_v_eod) { NFAEdge ee = add_edge_if_not_present(pred_info->v, new_v_eod, - g).first; + g); - // put edge top, if applicable - if (edgetop != (unsigned) -1) { - g[ee].top = edgetop; + // put edge tops, if applicable + if (!edgetops.empty()) { + assert(g[e].tops.empty() || g[e].tops == edgetops); + g[ee].tops = edgetops; } pred_info->succ.insert(new_vertex_info_eod); @@ -626,7 +629,8 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, // report behaviour with a single vertex). 
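partitionGraph above buckets vertices by their ClassInfo: equal signatures share an equivalence class, and classes are numbered densely in discovery order. A reduced sketch of that bucketing step, with a hypothetical two-field Signature standing in for ClassInfo (which also carries reach, tops, reports, depth and node type):

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    struct Signature {
        uint32_t reach; // stand-in for the real signature fields
        uint32_t flags;
        bool operator==(const Signature &o) const {
            return reach == o.reach && flags == o.flags;
        }
    };

    struct SigHash {
        size_t operator()(const Signature &s) const {
            return std::hash<uint64_t>()((uint64_t(s.reach) << 32) | s.flags);
        }
    };

    int main() {
        std::vector<Signature> verts = {{7, 0}, {9, 1}, {7, 0}, {9, 0}};
        std::unordered_map<Signature, unsigned, SigHash> classmap;
        std::vector<unsigned> eq_class(verts.size());

        for (size_t i = 0; i < verts.size(); i++) {
            // A new signature opens a fresh class; a seen one joins it.
            auto it = classmap.emplace(verts[i], classmap.size()).first;
            eq_class[i] = it->second;
        }
        for (size_t i = 0; i < verts.size(); i++) {
            std::cout << "vertex " << i << " -> class " << eq_class[i] << "\n";
        }
        return 0;
    }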
static bool mergeEquivalentClasses(vector &classes, - ptr_vector &infos, NGHolder &g) { + vector> &infos, + NGHolder &g) { bool merged = false; set toRemove; @@ -656,7 +660,7 @@ bool reduceGraphEquivalences(NGHolder &g, EquivalenceType eq_type) { // get information on every vertex in the graph // new vertices are allocated here, and stored in infos - ptr_vector infos = getVertexInfos(g); + auto infos = getVertexInfos(g); // partition the graph auto classes = partitionGraph(infos, work_queue, g, eq_type); @@ -674,7 +678,7 @@ bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) { DEBUG_PRINTF("equivalence processing disabled in grey box\n"); return false; } - g.renumberVertices(); + renumber_vertices(g); // Cheap check: if all the non-special vertices have in-degree one and // out-degree one, there's no redundancy in this here graph and we can diff --git a/src/nfagraph/ng_execute.cpp b/src/nfagraph/ng_execute.cpp index 4ffd89c0..9d904894 100644 --- a/src/nfagraph/ng_execute.cpp +++ b/src/nfagraph/ng_execute.cpp @@ -183,8 +183,6 @@ flat_set execute_graph(const NGHolder &g, return getVertices(work_states, info); } -typedef boost::reverse_graph RevNFAGraph; - namespace { class eg_visitor : public boost::default_dfs_visitor { public: @@ -195,13 +193,14 @@ public: info(info_in), input_g(input_g_in), states(states_in), succs(vertex_count) {} - void finish_vertex(NFAVertex input_v, const RevNFAGraph &) { + void finish_vertex(NFAVertex input_v, + const boost::reverse_graph &) { if (input_v == input_g.accept) { return; } assert(input_v != input_g.acceptEod); - DEBUG_PRINTF("finished p%u\n", input_g[input_v].index); + DEBUG_PRINTF("finished p%zu\n", input_g[input_v].index); /* finish vertex is called on vertex --> implies that all its parents * (in the forward graph) are also finished. Our parents will have @@ -236,7 +235,7 @@ public: /* we need to push into all our (forward) children their successors * from us. 
*/ for (auto v : adjacent_vertices_range(input_v, input_g)) { - DEBUG_PRINTF("pushing our states to pstate %u\n", + DEBUG_PRINTF("pushing our states to pstate %zu\n", input_g[v].index); if (v == input_g.startDs) { /* no need for intra start edges */ @@ -289,7 +288,7 @@ flat_set execute_graph(const NGHolder &running_g, map colours; /* could just a topo order, but really it is time to pull a slightly bigger * gun: DFS */ - RevNFAGraph revg(input_dag.g); + boost::reverse_graph revg(input_dag); map > dfs_states; auto info = makeInfoTable(running_g); @@ -308,7 +307,7 @@ flat_set execute_graph(const NGHolder &running_g, #ifdef DEBUG DEBUG_PRINTF(" output rstates:"); for (const auto &v : states) { - printf(" %u", running_g[v].index); + printf(" %zu", running_g[v].index); } printf("\n"); #endif diff --git a/src/nfagraph/ng_expr_info.cpp b/src/nfagraph/ng_expr_info.cpp index cfd34ce6..b43c7fd1 100644 --- a/src/nfagraph/ng_expr_info.cpp +++ b/src/nfagraph/ng_expr_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -118,7 +118,7 @@ void checkVertex(const ReportManager &rm, const NGWrapper &w, NFAVertex v, rd.max = min(rd.max, max_offset); } - DEBUG_PRINTF("vertex %u report %u: %s\n", w[v].index, report_id, + DEBUG_PRINTF("vertex %zu report %u: %s\n", w[v].index, report_id, rd.str().c_str()); info = unionDepthMinMax(info, rd); diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index bc101df2..a504ac50 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -172,8 +172,7 @@ void updateReportBounds(ReportManager &rm, NGWrapper &g, NFAVertex accept, new_reports.insert(rm.getInternalId(ir)); } - DEBUG_PRINTF("swapping reports on vertex %u\n", - g[v].index); + DEBUG_PRINTF("swapping reports on vertex %zu\n", g[v].index); reports.swap(new_reports); } } @@ -286,8 +285,8 @@ bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, add_edge(u, v, g); } - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); return true; } @@ -309,7 +308,7 @@ NFAVertex findSingleCyclic(const NGHolder &g) { } if (v != NGHolder::null_vertex()) { - DEBUG_PRINTF("cyclic is %u\n", g[v].index); + DEBUG_PRINTF("cyclic is %zu\n", g[v].index); assert(!is_special(v, g)); } return v; @@ -380,10 +379,9 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // Walk from the start vertex to the cyclic state and ensure we have a // chain of vertices. while (v != cyclic) { - DEBUG_PRINTF("vertex %u\n", g[v].index); + DEBUG_PRINTF("vertex %zu\n", g[v].index); width++; - tie(ai, ae) = adjacent_vertices(v, g); - set succ(ai, ae); + auto succ = succs(v, g); if (contains(succ, cyclic)) { if (succ.size() == 1) { v = cyclic; @@ -419,10 +417,9 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // Walk from the cyclic state to an accept and ensure we have a chain of // vertices. 
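Both walks in transformMinLengthToRepeat follow a chain of single-successor vertices, counting width and giving up on any branching. A simplified sketch of that chain walk on a plain BGL graph (chainWidth is an illustrative helper; Hyperscan's succs() is replaced by adjacent_vertices):

    #include <boost/graph/adjacency_list.hpp>
    #include <set>

    using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                        boost::bidirectionalS>;
    using Vertex = Graph::vertex_descriptor;

    // Hops from v to cyclic along a strict chain; -1 on any branching
    // ("bad form", as the pass puts it).
    int chainWidth(const Graph &g, Vertex v, Vertex cyclic) {
        int width = 0;
        while (v != cyclic) {
            auto adj = adjacent_vertices(v, g);
            std::set<Vertex> succ(adj.first, adj.second);
            if (succ.size() != 1) {
                return -1;
            }
            v = *succ.begin();
            width++;
        }
        return width;
    }

    int main() {
        Graph g(3);
        add_edge(0, 1, g);
        add_edge(1, 2, g);
        add_edge(2, 2, g); // vertex 2 is the cyclic state
        return chainWidth(g, 0, 2) == 2 ? 0 : 1;
    }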
while (!is_any_accept(v, g)) { - DEBUG_PRINTF("vertex %u\n", g[v].index); + DEBUG_PRINTF("vertex %zu\n", g[v].index); width++; - tie(ai, ae) = adjacent_vertices(v, g); - set succ(ai, ae); + auto succ = succs(v, g); if (succ.size() != 1) { DEBUG_PRINTF("bad form\n"); return false; @@ -437,7 +434,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { DEBUG_PRINTF("adjusting width by %d\n", offsetAdjust); width += offsetAdjust; - DEBUG_PRINTF("width=%u, vertex %u is cyclic\n", width, + DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n", width, g[cyclic].index); if (width >= g.min_length) { @@ -450,7 +447,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { vector preds; vector dead; for (auto u : inv_adjacent_vertices_range(cyclic, g)) { - DEBUG_PRINTF("pred %u\n", g[u].index); + DEBUG_PRINTF("pred %zu\n", g[u].index); if (u == cyclic) { continue; } @@ -486,8 +483,8 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { add_edge(u, cyclic, g); } - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); clearReports(g); g.min_length = 0; @@ -544,8 +541,7 @@ bool isEdgePrunable(const NGWrapper &g, const NFAVertex u = source(e, g); const NFAVertex v = target(e, g); - DEBUG_PRINTF("edge (%u,%u)\n", g[u].index, - g[v].index); + DEBUG_PRINTF("edge (%zu,%zu)\n", g[u].index, g[v].index); // Leave our special-to-special edges alone. if (is_special(u, g) && is_special(v, g)) { @@ -718,8 +714,7 @@ static bool isUnanchored(const NGHolder &g) { for (auto v : adjacent_vertices_range(g.start, g)) { if (!edge(g.startDs, v, g).second) { - DEBUG_PRINTF("fail, %u is anchored vertex\n", - g[v].index); + DEBUG_PRINTF("fail, %zu is anchored vertex\n", g[v].index); return false; } } @@ -864,7 +859,7 @@ void handleExtendedParams(ReportManager &rm, NGWrapper &g, } } } - //dumpGraph("final.dot", g.g); + //dumpGraph("final.dot", g); if (!hasExtParams(g)) { return; diff --git a/src/nfagraph/ng_fixed_width.cpp b/src/nfagraph/ng_fixed_width.cpp index 46d77913..978dad44 100644 --- a/src/nfagraph/ng_fixed_width.cpp +++ b/src/nfagraph/ng_fixed_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -77,7 +77,7 @@ bool findMask(const NGHolder &g, vector *mask, bool *anchored, NFAVertex v = *succs.begin(); while (true) { - DEBUG_PRINTF("validating vertex %u\n", g[v].index); + DEBUG_PRINTF("validating vertex %zu\n", g[v].index); assert(v != g.acceptEod); diff --git a/src/nfagraph/ng_graph.h b/src/nfagraph/ng_graph.h deleted file mode 100644 index 64b32839..00000000 --- a/src/nfagraph/ng_graph.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Definition of the NFAGraph type used for all NFA graph - * representations. - * - * Note that most of the time we don't work on a bare NFAGraph: instead - * we use an NGHolder, which wraps the graph and defines our special vertices, - * etc. - */ - -#ifndef NG_GRAPH_H -#define NG_GRAPH_H - -#include "util/charreach.h" -#include "util/ue2_containers.h" -#include "ue2common.h" - -#include -#include -#include - -namespace ue2 { - -/** \brief Properties associated with each vertex in an NFAGraph. */ -struct NFAGraphVertexProps { - /** \brief Set of characters on which this vertex is reachable. */ - CharReach char_reach; - - /** \brief Set of reports raised by this vertex. */ - ue2::flat_set reports; - - /** \brief Unique index for this vertex, used for BGL algorithms. */ - u32 index = 0; - - /** \brief Flags associated with assertions. */ - u32 assert_flags = 0; -}; - -/** \brief Properties associated with each edge in an NFAGraph. */ -struct NFAGraphEdgeProps { - /** \brief Unique index for this edge, used for BGL algorithms. */ - u32 index = 0; - - /** \brief For graphs that will be implemented as multi-top engines, this - * specifies the top event. Only used on edges from the start vertex. */ - u32 top = 0; - - /** \brief Flags associated with assertions. */ - u32 assert_flags = 0; -}; - -// For flexibility: boost::listS, boost::listS for out-edge and vertex lists. -// boost::bidirectionalS for directed graph so that we can get at in-edges. -typedef boost::adjacency_list NFAGraph; - -typedef NFAGraph::vertex_descriptor NFAVertex; -typedef NFAGraph::edge_descriptor NFAEdge; - -/** \brief vertex_index values for special nodes in the NFAGraph. */ -enum SpecialNodes { - /** \brief Anchored start vertex. WARNING: this may be triggered at various - * locations (not just zero) for triggered graphs. */ - NODE_START, - - /** \brief Unanchored start-dotstar vertex. WARNING: this may not have a - * proper self-loop. */ - NODE_START_DOTSTAR, - - /** \brief Accept vertex. All vertices that can match at arbitrary offsets - * must have an edge to this vertex. */ - NODE_ACCEPT, - - /** \brief Accept-EOD vertex. Vertices that must raise a match at EOD only - * must have an edge to this vertex. */ - NODE_ACCEPT_EOD, - - /** \brief Sentinel, number of special vertices. 
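For reference, the deleted typedef with its template arguments restored from the comment above it (listS vertex and out-edge storage, bidirectionalS so in-edges are available, bundled properties); this is the type that ue2_graph replaces in 4.4:

    typedef boost::adjacency_list<boost::listS, boost::listS,
                                  boost::bidirectionalS,
                                  NFAGraphVertexProps,
                                  NFAGraphEdgeProps> NFAGraph;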
*/ - N_SPECIALS -}; - -} // namespace ue2 - -#endif diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index e70b7708..e4be14c3 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -35,13 +35,12 @@ #include "nfa/goughcompile.h" #include "ng_holder.h" #include "ng_mcclellan_internal.h" -#include "ng_restructuring.h" #include "ng_som_util.h" #include "ng_squash.h" -#include "ng_util.h" #include "util/bitfield.h" #include "util/container.h" #include "util/determinise.h" +#include "util/graph.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/ue2_containers.h" @@ -118,11 +117,11 @@ public: using StateMap = typename Automaton_Traits::StateMap; protected: - Automaton_Base(const NGHolder &graph_in, - const flat_set &unused_in, som_type som, + Automaton_Base(const NGHolder &graph_in, som_type som, const vector> &triggers, bool unordered_som) - : graph(graph_in), numStates(num_vertices(graph)), unused(unused_in), + : graph(graph_in), numStates(num_vertices(graph)), + unused(getRedundantStarts(graph_in)), init(Automaton_Traits::init_states(numStates)), initDS(Automaton_Traits::init_states(numStates)), squash(Automaton_Traits::init_states(numStates)), @@ -210,7 +209,7 @@ public: const NGHolder &graph; const u32 numStates; - const flat_set &unused; + const flat_set unused; array alpha; array unalpha; @@ -251,10 +250,9 @@ struct Big_Traits { class Automaton_Big : public Automaton_Base { public: - Automaton_Big(const NGHolder &graph_in, - const flat_set &unused_in, som_type som, + Automaton_Big(const NGHolder &graph_in, som_type som, const vector> &triggers, bool unordered_som) - : Automaton_Base(graph_in, unused_in, som, triggers, unordered_som) {} + : Automaton_Base(graph_in, som, triggers, unordered_som) {} }; struct Graph_Traits { @@ -278,11 +276,10 @@ struct Graph_Traits { class Automaton_Graph : public Automaton_Base { public: - Automaton_Graph(const NGHolder &graph_in, - const flat_set &unused_in, som_type som, + Automaton_Graph(const NGHolder &graph_in, som_type som, const vector> &triggers, bool unordered_som) - : Automaton_Base(graph_in, unused_in, som, triggers, unordered_som) {} + : Automaton_Base(graph_in, som, triggers, unordered_som) {} }; class Automaton_Haig_Merge { @@ -452,7 +449,7 @@ void haig_do_preds(const NGHolder &g, const stateset &nfa_states, NFAVertex v = state_mapping[i]; s32 slot_id = g[v].index; - DEBUG_PRINTF("d vertex %u\n", g[v].index); + DEBUG_PRINTF("d vertex %zu\n", g[v].index); vector &out_map = preds[slot_id]; for (auto u : inv_adjacent_vertices_range(v, g)) { out_map.push_back(g[u].index); @@ -493,7 +490,7 @@ void haig_note_starts(const NGHolder &g, map *out) { for (auto v : vertices_range(g)) { if (is_any_start_inc_virtual(v, g)) { - DEBUG_PRINTF("%u creates new som value\n", g[v].index); + DEBUG_PRINTF("%zu creates new som value\n", g[v].index); out->emplace(g[v].index, 0U); continue; } @@ -504,7 +501,7 @@ void haig_note_starts(const NGHolder &g, map *out) { const DepthMinMax &d = depths[g[v].index]; if (d.min == d.max && d.min.is_finite()) { - DEBUG_PRINTF("%u is fixed at %u\n", g[v].index, (u32)d.min); + DEBUG_PRINTF("%zu is fixed at %u\n", g[v].index, (u32)d.min); out->emplace(g[v].index, d.min); } } @@ -512,15 +509,14 @@ void haig_note_starts(const NGHolder &g, map *out) { template static -bool doHaig(const NGHolder &g, - const flat_set &unused, - som_type som, const vector> &triggers, - bool unordered_som, raw_som_dfa *rdfa) { +bool doHaig(const NGHolder &g, som_type som, + const vector> &triggers, 
bool unordered_som, + raw_som_dfa *rdfa) { u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from a fight */ typedef typename Auto::StateSet StateSet; vector nfa_state_map; - Auto n(g, unused, som, triggers, unordered_som); + Auto n(g, som, triggers, unordered_som); try { if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) { DEBUG_PRINTF("state limit exceeded\n"); @@ -550,9 +546,9 @@ bool doHaig(const NGHolder &g, haig_do_preds(g, source_states, n.v_by_index, rdfa->state_som.back().preds); - haig_do_report(g, unused, g.accept, source_states, n.v_by_index, + haig_do_report(g, n.unused, g.accept, source_states, n.v_by_index, rdfa->state_som.back().reports); - haig_do_report(g, unused, g.acceptEod, source_states, n.v_by_index, + haig_do_report(g, n.unused, g.acceptEod, source_states, n.v_by_index, rdfa->state_som.back().reports_eod); } @@ -577,8 +573,6 @@ attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision, assert(allMatchStatesHaveReports(g)); assert(hasCorrectlyNumberedVertices(g)); - auto unused = findUnusedStates(g); - u32 numStates = num_vertices(g); if (numStates > HAIG_MAX_NFA_STATE) { DEBUG_PRINTF("giving up... looks too big\n"); @@ -592,12 +586,11 @@ attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision, bool rv; if (numStates <= NFA_STATE_LIMIT) { /* fast path */ - rv = doHaig(g, unused, som, triggers, unordered_som, + rv = doHaig(g, som, triggers, unordered_som, rdfa.get()); } else { /* not the fast path */ - rv = doHaig(g, unused, som, triggers, unordered_som, - rdfa.get()); + rv = doHaig(g, som, triggers, unordered_som, rdfa.get()); } if (!rv) { diff --git a/src/nfagraph/ng_holder.cpp b/src/nfagraph/ng_holder.cpp index 53566891..a2fbb288 100644 --- a/src/nfagraph/ng_holder.cpp +++ b/src/nfagraph/ng_holder.cpp @@ -36,123 +36,33 @@ using namespace std; namespace ue2 { // internal use only -static NFAVertex addSpecialVertex(NFAGraph &g, SpecialNodes id) { - NFAVertex v = add_vertex(g); +static NFAVertex addSpecialVertex(NGHolder &g, SpecialNodes id) { + NFAVertex v(add_vertex(g)); g[v].index = id; return v; } -NGHolder::NGHolder(void) - : g(), - // add initial special nodes - start(addSpecialVertex(g, NODE_START)), - startDs(addSpecialVertex(g, NODE_START_DOTSTAR)), - accept(addSpecialVertex(g, NODE_ACCEPT)), - acceptEod(addSpecialVertex(g, NODE_ACCEPT_EOD)), - // misc data - numVertices(N_SPECIALS), - numEdges(0), - isValidNumEdges(true), - isValidNumVertices(true) { - - // wire up some fake edges for the stylized bits of the NFA - add_edge(start, startDs, *this); - add_edge(startDs, startDs, *this); - add_edge(accept, acceptEod, *this); - - g[start].char_reach.setall(); - g[startDs].char_reach.setall(); -} - NGHolder::NGHolder(nfa_kind k) - : kind (k), g(), + : kind (k), // add initial special nodes - start(addSpecialVertex(g, NODE_START)), - startDs(addSpecialVertex(g, NODE_START_DOTSTAR)), - accept(addSpecialVertex(g, NODE_ACCEPT)), - acceptEod(addSpecialVertex(g, NODE_ACCEPT_EOD)), - // misc data - numVertices(N_SPECIALS), - numEdges(0), - isValidNumEdges(true), - isValidNumVertices(true) { + start(addSpecialVertex(*this, NODE_START)), + startDs(addSpecialVertex(*this, NODE_START_DOTSTAR)), + accept(addSpecialVertex(*this, NODE_ACCEPT)), + acceptEod(addSpecialVertex(*this, NODE_ACCEPT_EOD)) { // wire up some fake edges for the stylized bits of the NFA add_edge(start, startDs, *this); add_edge(startDs, startDs, *this); add_edge(accept, acceptEod, *this); - g[start].char_reach.setall(); - 
g[startDs].char_reach.setall(); + (*this)[start].char_reach.setall(); + (*this)[startDs].char_reach.setall(); } NGHolder::~NGHolder(void) { DEBUG_PRINTF("destroying holder @ %p\n", this); } -size_t num_edges(NGHolder &h) { - if (!h.isValidNumEdges) { - h.numEdges = num_edges(h.g); - h.isValidNumEdges = true; - } - return h.numEdges; -} - -size_t num_edges(const NGHolder &h) { - if (!h.isValidNumEdges) { - return num_edges(h.g); - } - return h.numEdges; -} - -size_t num_vertices(NGHolder &h) { - if (!h.isValidNumVertices) { - h.numVertices = num_vertices(h.g); - h.isValidNumVertices = true; - } - return h.numVertices; -} - -size_t num_vertices(const NGHolder &h) { - if (!h.isValidNumVertices) { - return num_vertices(h.g); - } - return h.numVertices; -} - -void remove_edge(const NFAEdge &e, NGHolder &h) { - remove_edge(e, h.g); - assert(!h.isValidNumEdges || h.numEdges > 0); - h.numEdges--; -} - -void remove_edge(NFAVertex u, NFAVertex v, NGHolder &h) { - remove_edge(u, v, h.g); - assert(!h.isValidNumEdges || h.numEdges > 0); - h.numEdges--; -} - -void remove_vertex(NFAVertex v, NGHolder &h) { - remove_vertex(v, h.g); - assert(!h.isValidNumVertices || h.numVertices > 0); - h.numVertices--; -} - -void clear_vertex(NFAVertex v, NGHolder &h) { - h.isValidNumEdges = false; - clear_vertex_faster(v, h.g); -} - -void clear_in_edges(NFAVertex v, NGHolder &h) { - h.isValidNumEdges = false; - clear_in_edges(v, h.g); -} - -void clear_out_edges(NFAVertex v, NGHolder &h) { - h.isValidNumEdges = false; - clear_out_edges(v, h.g); -} - void clear_graph(NGHolder &h) { NGHolder::vertex_iterator vi, ve; for (tie(vi, ve) = vertices(h); vi != ve;) { @@ -166,6 +76,8 @@ void clear_graph(NGHolder &h) { } assert(num_vertices(h) == N_SPECIALS); + renumber_vertices(h); /* ensure that we reset our next allocated index */ + renumber_edges(h); // Recreate special stylised edges. 
     add_edge(h.start, h.startDs, h);
@@ -173,57 +85,13 @@
     add_edge(h.accept, h.acceptEod, h);
 }
 
-std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v, NGHolder &h) {
-    assert(edge(u, v, h.g).second == false);
-    pair<NFAEdge, bool> e = add_edge(u, v, h.g);
-    h.g[e.first].index = h.numEdges++;
-    assert(!h.isValidNumEdges || h.numEdges > 0); // no wrapping
-    h.g[e.first].top = 0;
-    return e;
-}
-
-std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v,
-                                  const NFAGraph::edge_property_type &ep,
-                                  NGHolder &h) {
-    assert(edge(u, v, h.g).second == false);
-    pair<NFAEdge, bool> e = add_edge(u, v, ep, h.g);
-    h.g[e.first].index = h.numEdges++;
-    assert(!h.isValidNumEdges || h.numEdges > 0); // no wrapping
-    return e;
-}
-
-NFAVertex add_vertex(NGHolder &h) {
-    NFAVertex v = add_vertex(h.g);
-    h[v].index = h.numVertices++;
-    assert(h.numVertices > 0); // no wrapping
-    return v;
-}
-
-NFAVertex add_vertex(const NFAGraph::vertex_property_type &vp, NGHolder &h) {
-    NFAVertex v = add_vertex(h);
-    u32 i = h.g[v].index; /* preserve index */
-    h.g[v] = vp;
-    h.g[v].index = i;
-    return v;
-}
-
-void NGHolder::renumberEdges() {
-    numEdges = renumberGraphEdges(g);
-    isValidNumEdges = true;
-}
-
-void NGHolder::renumberVertices() {
-    numVertices = renumberGraphVertices(g);
-    isValidNumVertices = true;
-}
-
 NFAVertex NGHolder::getSpecialVertex(u32 id) const {
     switch (id) {
-    case NODE_START: return start;
-    case NODE_START_DOTSTAR: return startDs;
-    case NODE_ACCEPT: return accept;
-    case NODE_ACCEPT_EOD: return acceptEod;
-    default: return nullptr;
+    case NODE_START:         return start;
+    case NODE_START_DOTSTAR: return startDs;
+    case NODE_ACCEPT:        return accept;
+    case NODE_ACCEPT_EOD:    return acceptEod;
+    default:                 return null_vertex();
     }
 }
diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h
index f0a387d0..fbb6ac52 100644
--- a/src/nfagraph/ng_holder.h
+++ b/src/nfagraph/ng_holder.h
@@ -26,19 +26,75 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+/** \file
+ * \brief Definition of the NGHolder type used to represent general NFA
+ * graphs as well as all associated types (vertex and edge properties, etc).
+ *
+ * The NGHolder also contains the special vertices used to represent starts
+ * and accepts.
+ */
+
 #ifndef NG_HOLDER_H
 #define NG_HOLDER_H
 
-#include "ng_graph.h"
 #include "ue2common.h"
 #include "nfa/nfa_kind.h"
-
-#include
-#include
-#include
+#include "util/charreach.h"
+#include "util/ue2_containers.h"
+#include "util/ue2_graph.h"
 
 namespace ue2 {
 
+/** \brief Properties associated with each vertex in an NFAGraph. */
+struct NFAGraphVertexProps {
+    /** \brief Set of characters on which this vertex is reachable. */
+    CharReach char_reach;
+
+    /** \brief Set of reports raised by this vertex. */
+    flat_set<ReportID> reports;
+
+    /** \brief Unique index for this vertex, used for BGL algorithms. */
+    size_t index = 0;
+
+    /** \brief Flags associated with assertions. */
+    u32 assert_flags = 0;
+};
+
+/** \brief Properties associated with each edge in an NFAGraph. */
+struct NFAGraphEdgeProps {
+    /** \brief Unique index for this edge, used for BGL algorithms. */
+    size_t index = 0;
+
+    /** \brief For graphs that will be implemented as multi-top engines, this
+     * specifies the top events. Only used on edges from the start vertex. */
+    ue2::flat_set<u32> tops;
+
+    /** \brief Flags associated with assertions. */
+    u32 assert_flags = 0;
+};
+
+/** \brief vertex_index values for special nodes in the NFAGraph. */
+enum SpecialNodes {
+    /** \brief Anchored start vertex.
WARNING: this may be triggered at various + * locations (not just zero) for triggered graphs. */ + NODE_START, + + /** \brief Unanchored start-dotstar vertex. WARNING: this may not have a + * proper self-loop. */ + NODE_START_DOTSTAR, + + /** \brief Accept vertex. All vertices that can match at arbitrary offsets + * must have an edge to this vertex. */ + NODE_ACCEPT, + + /** \brief Accept-EOD vertex. Vertices that must raise a match at EOD only + * must have an edge to this vertex. */ + NODE_ACCEPT_EOD, + + /** \brief Sentinel, number of special vertices. */ + N_SPECIALS +}; + /** \brief Encapsulates an NFAGraph, stores special vertices and other * metadata. * @@ -49,188 +105,34 @@ namespace ue2 { * - (startDs, startDs) (self-loop) * - (accept, acceptEod) */ -class NGHolder : boost::noncopyable { +class NGHolder : public ue2_graph { public: - NGHolder(void); explicit NGHolder(nfa_kind kind); + NGHolder(void) : NGHolder(NFA_OUTFIX) {}; virtual ~NGHolder(void); - // Pack edge and vertex indices. - // Note: maintaining edge index order can be expensive due to the frequency - // of edge removal/addition, so only renumberEdges() when required by - // operations on edge lists. - void renumberEdges(); - void renumberVertices(); + nfa_kind kind; /* Role that this plays in Rose */ - NFAVertex getSpecialVertex(u32 id) const; + static const size_t N_SPECIAL_VERTICES = N_SPECIALS; +public: + const vertex_descriptor start; //!< Anchored start vertex. + const vertex_descriptor startDs; //!< Unanchored start-dotstar vertex. + const vertex_descriptor accept; //!< Accept vertex. + const vertex_descriptor acceptEod; //!< Accept at EOD vertex. - nfa_kind kind = NFA_OUTFIX; /* Role that this plays in Rose */ - - /** \brief Underlying graph object */ - NFAGraph g; - - const NFAVertex start; //!< Anchored start vertex. - const NFAVertex startDs; //!< Unanchored start-dotstar vertex. - const NFAVertex accept; //!< Accept vertex. - const NFAVertex acceptEod; //!< Accept at EOD vertex. - - using directed_category = NFAGraph::directed_category; - using edge_parallel_category = NFAGraph::edge_parallel_category; - using traversal_category = NFAGraph::traversal_category; - - using vertex_descriptor = NFAGraph::vertex_descriptor; - using edge_descriptor = NFAGraph::edge_descriptor; - using adjacency_iterator = NFAGraph::adjacency_iterator; - using edge_iterator = NFAGraph::edge_iterator; - using in_edge_iterator = NFAGraph::in_edge_iterator; - using inv_adjacency_iterator = NFAGraph::inv_adjacency_iterator; - using out_edge_iterator = NFAGraph::out_edge_iterator; - using vertex_iterator = NFAGraph::vertex_iterator; - using edge_property_type = NFAGraph::edge_property_type; - using vertex_property_type = NFAGraph::vertex_property_type; - - // These free functions, which follow the BGL model, are the interface to - // the graph held by this class. 
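Because NGHolder now derives from a graph type, BGL's free functions and the bundled-property subscript apply to it directly, which is what lets all of the forwarding friends below go. A minimal sketch of the pattern, with a bundled adjacency_list standing in for the Hyperscan-internal ue2_graph (Holder and VProps are illustrative names):

    #include <boost/graph/adjacency_list.hpp>

    struct VProps { size_t index = 0; };

    struct Holder : boost::adjacency_list<boost::vecS, boost::vecS,
                                          boost::bidirectionalS, VProps> {};

    int main() {
        Holder h;
        auto v = add_vertex(h); // BGL free function found via ADL
        h[v].index = 7;         // inherited bundled-property subscript
        return num_vertices(h) == 1 ? 0 : 1;
    }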
- friend size_t num_vertices(NGHolder &h); - friend size_t num_vertices(const NGHolder &h); - friend size_t num_edges(NGHolder &h); - friend size_t num_edges(const NGHolder &h); - friend void remove_vertex(NFAVertex v, NGHolder &h); - friend void clear_vertex(NFAVertex v, NGHolder &h); - friend void clear_in_edges(NFAVertex v, NGHolder &h); - friend void clear_out_edges(NFAVertex v, NGHolder &h); - friend void remove_edge(const NFAEdge &e, NGHolder &h); - friend void remove_edge(NFAVertex u, NFAVertex v, NGHolder &h); - - template - friend void remove_out_edge_if(NFAVertex v, Predicate pred, NGHolder &h) { - boost::remove_out_edge_if(v, pred, h.g); - h.isValidNumEdges = false; - } - - template - friend void remove_in_edge_if(NFAVertex v, Predicate pred, NGHolder &h) { - boost::remove_in_edge_if(v, pred, h.g); - h.isValidNumEdges = false; - } - - template - friend void remove_edge_if(Predicate pred, NGHolder &h) { - boost::remove_edge_if(pred, h.g); - h.isValidNumEdges = false; - } - - friend std::pair add_edge(NFAVertex u, NFAVertex v, - NGHolder &h); - friend std::pair add_edge(NFAVertex u, NFAVertex v, - const edge_property_type &ep, - NGHolder &h); - friend NFAVertex add_vertex(NGHolder &h); - friend NFAVertex add_vertex(const vertex_property_type &vp, NGHolder &h); - - static NFAVertex null_vertex(void) { return NFAGraph::null_vertex(); } - - // Subscript operators for BGL bundled properties. - using graph_bundled = NFAGraph::graph_bundled; - using vertex_bundled = NFAGraph::vertex_bundled; - using edge_bundled = NFAGraph::edge_bundled; - - vertex_bundled &operator[](NFAVertex v) { - return get(boost::vertex_bundle, g)[v]; - } - const vertex_bundled &operator[](NFAVertex v) const { - return get(boost::vertex_bundle, g)[v]; - } - edge_bundled &operator[](const NFAEdge &e) { - return get(boost::edge_bundle, g)[e]; - } - const edge_bundled &operator[](const NFAEdge &e) const { - return get(boost::edge_bundle, g)[e]; - } - -protected: - - /* Since the NFAGraph vertex/edge list selectors are std::lists, computing - * num_vertices and num_edges is O(N). We use these members to store a - * cached copy of the size. - * - * In the future, with C++11's constant-time std::list::size, these may - * become obsolete. */ - - u32 numVertices; - u32 numEdges; - bool isValidNumEdges; - bool isValidNumVertices; + vertex_descriptor getSpecialVertex(u32 id) const; }; +typedef NGHolder::vertex_descriptor NFAVertex; +typedef NGHolder::edge_descriptor NFAEdge; + /** \brief True if the vertex \p v is one of our special vertices. 
*/ template -static really_inline -bool is_special(const NFAVertex v, const GraphT &g) { +bool is_special(const typename GraphT::vertex_descriptor v, const GraphT &g) { return g[v].index < N_SPECIALS; } -static really_inline -std::pair -adjacent_vertices(NFAVertex v, const NGHolder &h) { - return adjacent_vertices(v, h.g); -} - -static really_inline -std::pair edge(NFAVertex u, NFAVertex v, const NGHolder &h) { - return boost::edge(u, v, h.g); -} - -static really_inline -std::pair -edges(const NGHolder &h) { - return edges(h.g); -} - -static really_inline -size_t in_degree(NFAVertex v, const NGHolder &h) { - return in_degree(v, h.g); -} - -static really_inline -std::pair -in_edges(NFAVertex v, const NGHolder &h) { - return in_edges(v, h.g); -} - -static really_inline -std::pair -inv_adjacent_vertices(NFAVertex v, const NGHolder &h) { - return inv_adjacent_vertices(v, h.g); -} - -static really_inline -size_t out_degree(NFAVertex v, const NGHolder &h) { - return out_degree(v, h.g); -} - -static really_inline -std::pair -out_edges(NFAVertex v, const NGHolder &h) { - return out_edges(v, h.g); -} - -static really_inline -NFAVertex source(const NFAEdge &e, const NGHolder &h) { - return source(e, h.g); -} - -static really_inline -NFAVertex target(const NFAEdge &e, const NGHolder &h) { - return target(e, h.g); -} - -static really_inline -std::pair -vertices(const NGHolder &h) { - return vertices(h.g); -} - /** * \brief Clears all non-special vertices and edges from the graph. * @@ -239,16 +141,6 @@ vertices(const NGHolder &h) { */ void clear_graph(NGHolder &h); -inline -void renumber_edges(NGHolder &h) { - h.renumberEdges(); -} - -inline -void renumber_vertices(NGHolder &h) { - h.renumberVertices(); -} - /* * \brief Clear and remove all of the vertices pointed to by the given iterator * range. @@ -275,8 +167,8 @@ void remove_vertices(Iter begin, Iter end, NGHolder &h, bool renumber = true) { } if (renumber) { - h.renumberEdges(); - h.renumberVertices(); + renumber_edges(h); + renumber_vertices(h); } } @@ -311,10 +203,12 @@ void remove_edges(Iter begin, Iter end, NGHolder &h, bool renumber = true) { } if (renumber) { - h.renumberEdges(); + renumber_edges(h); } } +#define DEFAULT_TOP 0U + /** \brief Clear and remove all of the edges pointed to by the edge descriptors * in the given container. * diff --git a/src/nfagraph/ng_is_equal.cpp b/src/nfagraph/ng_is_equal.cpp index cc65fa17..2df79f50 100644 --- a/src/nfagraph/ng_is_equal.cpp +++ b/src/nfagraph/ng_is_equal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -77,6 +77,26 @@ private: ReportID a_rep; ReportID b_rep; }; + +/** Comparison functor used to sort by vertex_index. 
*/ +template +struct VertexIndexOrdering { + explicit VertexIndexOrdering(const Graph &g_in) : g(g_in) {} + bool operator()(typename Graph::vertex_descriptor a, + typename Graph::vertex_descriptor b) const { + assert(a == b || g[a].index != g[b].index); + return g[a].index < g[b].index; + } +private: + const Graph &g; +}; + +template +static +VertexIndexOrdering make_index_ordering(const Graph &g) { + return VertexIndexOrdering(g); +} + } static @@ -109,7 +129,7 @@ bool is_equal_i(const NGHolder &a, const NGHolder &b, for (size_t i = 0; i < vert_a.size(); i++) { NFAVertex va = vert_a[i]; NFAVertex vb = vert_b[i]; - DEBUG_PRINTF("vertex %u\n", a[va].index); + DEBUG_PRINTF("vertex %zu\n", a[va].index); // Vertex index must be the same. if (a[va].index != b[vb].index) { @@ -153,14 +173,14 @@ bool is_equal_i(const NGHolder &a, const NGHolder &b, } /* check top for edges out of start */ - vector> top_a; - vector> top_b; + vector>> top_a; + vector>> top_b; for (const auto &e : out_edges_range(a.start, a)) { - top_a.emplace_back(a[target(e, a)].index, a[e].top); + top_a.emplace_back(a[target(e, a)].index, a[e].tops); } for (const auto &e : out_edges_range(b.start, b)) { - top_b.emplace_back(b[target(e, b)].index, b[e].top); + top_b.emplace_back(b[target(e, b)].index, b[e].tops); } sort(top_a.begin(), top_a.end()); diff --git a/src/nfagraph/ng_lbr.cpp b/src/nfagraph/ng_lbr.cpp index d7183817..d832bdaa 100644 --- a/src/nfagraph/ng_lbr.cpp +++ b/src/nfagraph/ng_lbr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -153,8 +153,7 @@ aligned_unique_ptr buildLbrDot(const CharReach &cr, const depth &repeatMin, enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); - aligned_unique_ptr nfa - = makeLbrNfa(LBR_NFA_Dot, rtype, repeatMax); + auto nfa = makeLbrNfa(LBR_NFA_DOT, rtype, repeatMax); struct lbr_dot *ld = (struct lbr_dot *)getMutableImplNfa(nfa.get()); fillNfa(nfa.get(), &ld->common, report, repeatMin, repeatMax, @@ -177,8 +176,7 @@ aligned_unique_ptr buildLbrVerm(const CharReach &cr, enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); - aligned_unique_ptr nfa - = makeLbrNfa(LBR_NFA_Verm, rtype, repeatMax); + auto nfa = makeLbrNfa(LBR_NFA_VERM, rtype, repeatMax); struct lbr_verm *lv = (struct lbr_verm *)getMutableImplNfa(nfa.get()); lv->c = escapes.find_first(); @@ -202,8 +200,7 @@ aligned_unique_ptr buildLbrNVerm(const CharReach &cr, enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); - aligned_unique_ptr nfa - = makeLbrNfa(LBR_NFA_NVerm, rtype, repeatMax); + auto nfa = makeLbrNfa(LBR_NFA_NVERM, rtype, repeatMax); struct lbr_verm *lv = (struct lbr_verm *)getMutableImplNfa(nfa.get()); lv->c = escapes.find_first(); @@ -221,14 +218,13 @@ aligned_unique_ptr buildLbrShuf(const CharReach &cr, bool is_reset, ReportID report) { enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); - aligned_unique_ptr nfa - = makeLbrNfa(LBR_NFA_Shuf, rtype, repeatMax); + auto nfa = makeLbrNfa(LBR_NFA_SHUF, rtype, repeatMax); struct lbr_shuf *ls = (struct lbr_shuf *)getMutableImplNfa(nfa.get()); fillNfa(nfa.get(), &ls->common, report, repeatMin, repeatMax, minPeriod, rtype); - if (shuftiBuildMasks(~cr, &ls->mask_lo, &ls->mask_hi) == -1) { + if (shuftiBuildMasks(~cr, (u8 *)&ls->mask_lo, (u8 
*)&ls->mask_hi) == -1) { return nullptr; } @@ -243,14 +239,13 @@ aligned_unique_ptr buildLbrTruf(const CharReach &cr, bool is_reset, ReportID report) { enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); - aligned_unique_ptr nfa - = makeLbrNfa(LBR_NFA_Truf, rtype, repeatMax); + auto nfa = makeLbrNfa(LBR_NFA_TRUF, rtype, repeatMax); struct lbr_truf *lc = (struct lbr_truf *)getMutableImplNfa(nfa.get()); fillNfa(nfa.get(), &lc->common, report, repeatMin, repeatMax, minPeriod, rtype); - truffleBuildMasks(~cr, &lc->mask1, &lc->mask2); + truffleBuildMasks(~cr, (u8 *)&lc->mask1, (u8 *)&lc->mask2); DEBUG_PRINTF("built truffle lbr\n"); return nfa; diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 72efa43a..e92790b9 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -54,10 +54,15 @@ #include "util/ue2_containers.h" #include "util/verify_types.h" +#include #include #include +#include + using namespace std; +using boost::adaptors::map_values; +using boost::adaptors::map_keys; namespace ue2 { @@ -73,8 +78,7 @@ bool sanityCheckGraph(const NGHolder &g, // Non-specials should have non-empty reachability. if (!is_special(v, g)) { if (g[v].char_reach.none()) { - DEBUG_PRINTF("vertex %u has empty reach\n", - g[v].index); + DEBUG_PRINTF("vertex %zu has empty reach\n", g[v].index); return false; } } @@ -83,25 +87,23 @@ bool sanityCheckGraph(const NGHolder &g, // other vertices must not have them. if (is_match_vertex(v, g) && v != g.accept) { if (g[v].reports.empty()) { - DEBUG_PRINTF("vertex %u has no reports\n", g[v].index); + DEBUG_PRINTF("vertex %zu has no reports\n", g[v].index); return false; } } else if (!g[v].reports.empty()) { - DEBUG_PRINTF("vertex %u has reports but no accept edge\n", + DEBUG_PRINTF("vertex %zu has reports but no accept edge\n", g[v].index); return false; } // Participant vertices should have distinct state indices. if (!contains(state_ids, v)) { - DEBUG_PRINTF("vertex %u has no state index!\n", - g[v].index); + DEBUG_PRINTF("vertex %zu has no state index!\n", g[v].index); return false; } u32 s = state_ids.at(v); if (s != NO_STATE && !seen_states.insert(s).second) { - DEBUG_PRINTF("vertex %u has dupe state %u\n", - g[v].index, s); + DEBUG_PRINTF("vertex %zu has dupe state %u\n", g[v].index, s); return false; } } @@ -118,9 +120,11 @@ void findSquashStates(const NGHolder &g, filterSquashers(g, squashMap); /* We also filter out the cyclic states representing bounded repeats, as - * they are not really cyclic. */ + * they are not really cyclic -- they may turn off unexpectedly. 
*/ for (const auto &br : repeats) { - squashMap.erase(br.cyclic); + if (br.repeatMax.is_finite()) { + squashMap.erase(br.cyclic); + } } } @@ -144,76 +148,319 @@ void dropRedundantStartEdges(NGHolder &g) { } static -void makeTopStates(NGHolder &g, map &tops, - const map &top_reach) { - map> top_succs; - for (const auto &e : out_edges_range(g.start, g)) { - NFAVertex v = target(e, g); - if (v == g.startDs) { - continue; - } - u32 t = g[e].top; - top_succs[t].push_back(v); - } - - for (const auto &top : top_succs) { - u32 t = top.first; - - CharReach top_cr; +CharReach calcTopVertexReach(const flat_set &tops, + const map &top_reach) { + CharReach top_cr; + for (u32 t : tops) { if (contains(top_reach, t)) { - top_cr = top_reach.at(t); + top_cr |= top_reach.at(t); } else { top_cr = CharReach::dot(); - } - - assert(!contains(tops, t)); - - NFAVertex s = NGHolder::null_vertex(); - flat_set succs; - insert(&succs, top.second); - - for (auto v : top.second) { - if (!top_cr.isSubsetOf(g[v].char_reach)) { - continue; - } - - flat_set vsuccs; - insert(&vsuccs, adjacent_vertices(v, g)); - - if (succs != vsuccs) { - continue; - } - - if (g[v].reports != g[g.start].reports) { - continue; - } - s = v; break; } + } + return top_cr; +} - if (!s) { - s = add_vertex(g[g.start], g); - g[s].char_reach = top_cr; - for (auto v : top.second) { - add_edge(s, v, g); +static +NFAVertex makeTopStartVertex(NGHolder &g, const flat_set &tops, + const flat_set &succs, + const map &top_reach) { + assert(!succs.empty()); + assert(!tops.empty()); + + bool reporter = false; + + NFAVertex u = add_vertex(g[g.start], g); + CharReach top_cr = calcTopVertexReach(tops, top_reach); + g[u].char_reach = top_cr; + + for (auto v : succs) { + if (v == g.accept || v == g.acceptEod) { + reporter = true; + } + add_edge(u, v, g); + } + + // Only retain reports (which we copied on add_vertex above) for new top + // vertices connected to accepts. 
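calcTopVertexReach above unions the declared reach of every top in the set and widens to dot as soon as a top has no entry. The same logic on standard types, with std::bitset standing in for CharReach and calcTopReach as an illustrative name:

    #include <bitset>
    #include <cstdint>
    #include <map>
    #include <set>

    using Reach = std::bitset<256>; // stand-in for CharReach

    Reach calcTopReach(const std::set<uint32_t> &tops,
                       const std::map<uint32_t, Reach> &top_reach) {
        Reach cr;
        for (uint32_t t : tops) {
            auto it = top_reach.find(t);
            if (it == top_reach.end()) {
                cr.set(); // top with no declared reach: assume dot
                break;
            }
            cr |= it->second;
        }
        return cr;
    }

    int main() {
        std::map<uint32_t, Reach> top_reach;
        top_reach[0].set('a');
        return calcTopReach({0, 1}, top_reach).all() ? 0 : 1; // 1 -> dot
    }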
+    if (!reporter) {
+        g[u].reports.clear();
+    }
+
+    return u;
+}
+
+static
+void pickNextTopStateToHandle(const map<u32, flat_set<NFAVertex>> &top_succs,
+                              const map<NFAVertex, flat_set<u32>> &succ_tops,
+                              flat_set<u32> *picked_tops,
+                              flat_set<NFAVertex> *picked_succs) {
+    /* pick top or vertex we want to handle */
+    if (top_succs.size() < succ_tops.size()) {
+        auto best = top_succs.end();
+        for (auto it = top_succs.begin(); it != top_succs.end(); ++it) {
+            if (best == top_succs.end()
+                || it->second.size() < best->second.size()) {
+                best = it;
            }
        }
-        tops[t] = s;
+        assert(best != top_succs.end());
+        assert(!best->second.empty()); /* should already have been pruned */
+
+        *picked_tops = { best->first };
+        *picked_succs = best->second;
+    } else {
+        auto best = succ_tops.end();
+        for (auto it = succ_tops.begin(); it != succ_tops.end(); ++it) {
+            /* have to worry about determinism for this one */
+            if (best == succ_tops.end()
+                || it->second.size() < best->second.size()
+                || (it->second.size() == best->second.size()
+                    && it->second < best->second)) {
+                best = it;
+            }
+        }
+        assert(best != succ_tops.end());
+        assert(!best->second.empty()); /* should already have been pruned */
+
+        *picked_succs = { best->first };
+        *picked_tops = best->second;
    }
+}
+
+static
+void expandCbsByTops(const map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                     const map<u32, flat_set<NFAVertex>> &top_succs,
+                     const map<NFAVertex, flat_set<u32>> &succ_tops,
+                     flat_set<u32> &picked_tops,
+                     flat_set<NFAVertex> &picked_succs) {
+    NFAVertex v = *picked_succs.begin(); /* arbitrary successor - all equiv */
+    const auto &cand_tops = succ_tops.at(v);
+
+    for (u32 t : cand_tops) {
+        if (!contains(unhandled_top_succs, t)) {
+            continue;
+        }
+        if (!has_intersection(unhandled_top_succs.at(t), picked_succs)) {
+            continue; /* not adding any useful work that hasn't already been
+                       * done */
+        }
+        if (!is_subset_of(picked_succs, top_succs.at(t))) {
+            continue; /* will not form a cbs */
+        }
+        picked_tops.insert(t);
+    }
+}
+
+static
+void expandCbsBySuccs(const map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                      const map<u32, flat_set<NFAVertex>> &top_succs,
+                      const map<NFAVertex, flat_set<u32>> &succ_tops,
+                      flat_set<u32> &picked_tops,
+                      flat_set<NFAVertex> &picked_succs) {
+    u32 t = *picked_tops.begin(); /* arbitrary top - all equiv */
+    const auto &cand_succs = top_succs.at(t);
+
+    for (NFAVertex v : cand_succs) {
+        if (!contains(unhandled_succ_tops, v)) {
+            continue;
+        }
+        if (!has_intersection(unhandled_succ_tops.at(v), picked_tops)) {
+            continue; /* not adding any useful work that hasn't already been
+                       * done */
+        }
+        if (!is_subset_of(picked_tops, succ_tops.at(v))) {
+            continue; /* will not form a cbs */
+        }
+        picked_succs.insert(v);
+    }
+}
+
+/* See if we can expand the complete bipartite subgraph (cbs) specified by the
+ * picked tops/succs by adding more to either of the tops or succs.
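+ *
+ * For example, if tops {t1, t2} each trigger exactly the successors {a, b},
+ * those four edges form a cbs, and a single added start vertex (whose reach
+ * is the union of the two tops' reach) can serve both tops.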
+ */
+static
+void expandTopSuccCbs(const map<u32, flat_set<NFAVertex>> &top_succs,
+                      const map<NFAVertex, flat_set<u32>> &succ_tops,
+                      const map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                      const map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                      flat_set<u32> &picked_tops,
+                      flat_set<NFAVertex> &picked_succs) {
+    /* Note: all picked (tops|succs) are equivalent */
+
+    /* Try to expand first (as we are more likely to succeed) on the side
+     * with fewest remaining things to be handled */
+
+    if (unhandled_top_succs.size() < unhandled_succ_tops.size()) {
+        expandCbsByTops(unhandled_top_succs, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+        expandCbsBySuccs(unhandled_succ_tops, top_succs, succ_tops,
+                         picked_tops, picked_succs);
+    } else {
+        expandCbsBySuccs(unhandled_succ_tops, top_succs, succ_tops,
+                         picked_tops, picked_succs);
+        expandCbsByTops(unhandled_top_succs, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+    }
+}
+
+static
+void markTopSuccAsHandled(NFAVertex start_v,
+                          const flat_set<u32> &handled_tops,
+                          const flat_set<NFAVertex> &handled_succs,
+                          map<u32, flat_set<NFAVertex>> &tops_out,
+                          map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                          map<NFAVertex, flat_set<u32>> &unhandled_succ_tops) {
+    for (u32 t : handled_tops) {
+        tops_out[t].insert(start_v);
+        assert(contains(unhandled_top_succs, t));
+        erase_all(&unhandled_top_succs[t], handled_succs);
+        if (unhandled_top_succs[t].empty()) {
+            unhandled_top_succs.erase(t);
+        }
+    }
+
+    for (NFAVertex v : handled_succs) {
+        assert(contains(unhandled_succ_tops, v));
+        erase_all(&unhandled_succ_tops[v], handled_tops);
+        if (unhandled_succ_tops[v].empty()) {
+            unhandled_succ_tops.erase(v);
+        }
+    }
+}
+
+static
+void attemptToUseAsStart(const NGHolder &g, NFAVertex u,
+                         const map<u32, CharReach> &top_reach,
+                         map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                         map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                         map<u32, flat_set<NFAVertex>> &tops_out) {
+    flat_set<u32> top_inter = unhandled_succ_tops.at(u);
+    flat_set<NFAVertex> succs;
+    for (NFAVertex v : adjacent_vertices_range(u, g)) {
+        if (!contains(unhandled_succ_tops, v)) {
+            return;
+        }
+        /* if it has vacuous reports we need to make sure that the report sets
+         * are the same */
+        if ((v == g.accept || v == g.acceptEod)
+            && g[g.start].reports != g[u].reports) {
+            DEBUG_PRINTF("different report behaviour\n");
+            return;
+        }
+        const flat_set<u32> &v_tops = unhandled_succ_tops.at(v);
+        flat_set<u32> new_inter;
+        auto ni_inserter = inserter(new_inter, new_inter.end());
+        set_intersection(top_inter.begin(), top_inter.end(),
+                         v_tops.begin(), v_tops.end(), ni_inserter);
+        top_inter = move(new_inter);
+        succs.insert(v);
+    }
+
+    if (top_inter.empty()) {
+        return;
+    }
+
+    auto top_cr = calcTopVertexReach(top_inter, top_reach);
+    if (!top_cr.isSubsetOf(g[u].char_reach)) {
+        return;
+    }
+
+    DEBUG_PRINTF("reusing %zu as a start vertex\n", g[u].index);
+    markTopSuccAsHandled(u, top_inter, succs, tops_out, unhandled_top_succs,
+                         unhandled_succ_tops);
+}
+
+/* We may have cases where a top triggers something that starts with a .* (or
+ * similar state). In these cases we can make use of that state as a start
+ * state.
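+ * For example, a top that switches on the initial state of /.*foo/ can use
+ * the self-looping dot state itself as its start state rather than adding a
+ * fresh vertex; this is why only self-looping vertices are candidates below.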
+ */
+static
+void reusePredsAsStarts(const NGHolder &g, const map<u32, CharReach> &top_reach,
+                        map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                        map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                        map<u32, flat_set<NFAVertex>> &tops_out) {
+    /* create list of candidates first, to avoid issues of iter invalidation */
+    DEBUG_PRINTF("attempting to reuse vertices for top starts\n");
+    vector<NFAVertex> cand_starts;
+    for (NFAVertex u : unhandled_succ_tops | map_keys) {
+        if (hasSelfLoop(u, g)) {
+            cand_starts.push_back(u);
+        }
+    }
+
+    for (NFAVertex u : cand_starts) {
+        if (!contains(unhandled_succ_tops, u)) {
+            continue;
+        }
+        attemptToUseAsStart(g, u, top_reach, unhandled_top_succs,
+                            unhandled_succ_tops, tops_out);
+    }
+}
+
+static
+void makeTopStates(NGHolder &g, map<u32, flat_set<NFAVertex>> &tops_out,
+                   const map<u32, CharReach> &top_reach) {
+    /* Ideally, we want to add the smallest number of states to the graph for
+     * tops to turn on so that they can accurately trigger their successors.
+     *
+     * The relationships between tops and their successors form a bipartite
+     * graph. Finding the optimal number of start states to add is equivalent
+     * to finding a minimal biclique covering. Unfortunately, this is known to
+     * be NP-complete.
+     *
+     * Given this, we will just do something simple to avoid creating something
+     * truly wasteful:
+     * 1) Try to find any cyclic states which can act as their own start states
+     * 2) Pick a top or a succ to create a start state for and then try to find
+     *    the largest complete bipartite subgraph that it is part of.
+     */
+
+    map<u32, flat_set<NFAVertex>> top_succs;
+    map<NFAVertex, flat_set<u32>> succ_tops;
+    for (const auto &e : out_edges_range(g.start, g)) {
+        NFAVertex v = target(e, g);
+        for (u32 t : g[e].tops) {
+            top_succs[t].insert(v);
+            succ_tops[v].insert(t);
+        }
+    }
+
+    auto unhandled_top_succs = top_succs;
+    auto unhandled_succ_tops = succ_tops;
+
+    reusePredsAsStarts(g, top_reach, unhandled_top_succs, unhandled_succ_tops,
+                       tops_out);
+
+    /* Note: there may be successors which are equivalent (in terms of
+       top-triggering); it may be more efficient to discover this and treat
+       them as a unit. TODO */
+
+    while (!unhandled_succ_tops.empty()) {
+        assert(!unhandled_top_succs.empty());
+        DEBUG_PRINTF("creating top start vertex\n");
+        flat_set<u32> u_tops;
+        flat_set<NFAVertex> u_succs;
+        pickNextTopStateToHandle(unhandled_top_succs, unhandled_succ_tops,
+                                 &u_tops, &u_succs);
+
+        expandTopSuccCbs(top_succs, succ_tops, unhandled_top_succs,
+                         unhandled_succ_tops, u_tops, u_succs);
+
+        /* create start vertex to handle this top/succ combination */
+        NFAVertex u = makeTopStartVertex(g, u_tops, u_succs, top_reach);
+
+        /* update maps */
+        markTopSuccAsHandled(u, u_tops, u_succs, tops_out, unhandled_top_succs,
+                             unhandled_succ_tops);
+    }
+    assert(unhandled_top_succs.empty());

    // We are completely replacing the start vertex, so clear its reports.
    clear_out_edges(g.start, g);
    add_edge(g.start, g.startDs, g);
    g[g.start].reports.clear();
-
-    // Only retain reports (which we copied on add_vertex above) for new top
    // vertices connected to accepts.
- for (const auto &m : tops) { - NFAVertex v = m.second; - if (!edge(v, g.accept, g).second && !edge(v, g.acceptEod, g).second) { - g[v].reports.clear(); - } - } } static @@ -232,7 +479,7 @@ set findZombies(const NGHolder &h, } if (in_degree(h.acceptEod, h) != 1 || all_reports(h).size() != 1) { - DEBUG_PRINTF("can be made undead - bad reports\n"); + DEBUG_PRINTF("cannot be made undead - bad reports\n"); return zombies; } @@ -321,7 +568,8 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm, const map>> &triggers, bool impl_test_only, const CompileContext &cc, ue2::unordered_map &state_ids, - vector &repeats, map &tops) { + vector &repeats, + map> &tops) { assert(is_triggered(h_in) || fixed_depth_tops.empty()); unique_ptr h = cloneHolder(h_in); @@ -331,15 +579,19 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm, impl_test_only, cc.grey); // If we're building a rose/suffix, do the top dance. + flat_set topVerts; if (is_triggered(*h)) { makeTopStates(*h, tops, findTopReach(triggers)); + + for (const auto &vv : tops | map_values) { + insert(&topVerts, vv); + } } dropRedundantStartEdges(*h); // Do state numbering - state_ids = numberStates(*h, tops); - dropUnusedStarts(*h, state_ids); + state_ids = numberStates(*h, topVerts); // In debugging, we sometimes like to reverse the state numbering to stress // the NFA construction code. @@ -364,7 +616,7 @@ void remapReportsToPrograms(NGHolder &h, const ReportManager &rm) { u32 program = rm.getProgramOffset(id); reports.insert(program); } - DEBUG_PRINTF("vertex %u: remapped reports {%s} to programs {%s}\n", + DEBUG_PRINTF("vertex %zu: remapped reports {%s} to programs {%s}\n", h[v].index, as_string_list(old_reports).c_str(), as_string_list(reports).c_str()); } @@ -385,14 +637,14 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, ue2::unordered_map state_ids; vector repeats; - map tops; + map> tops; unique_ptr h = prepareGraph(h_in, rm, fixed_depth_tops, triggers, impl_test_only, cc, state_ids, repeats, tops); // Quick exit: if we've got an embarrassment of riches, i.e. more states // than we can implement in our largest NFA model, bail here. - u32 numStates = countStates(*h, state_ids, false); + u32 numStates = countStates(state_ids); if (numStates > NFA_MAX_STATES) { DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates); return nullptr; @@ -465,13 +717,11 @@ aligned_unique_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, assert(h.kind == NFA_REV_PREFIX); /* triggered, raises internal callbacks */ // Do state numbering. - auto state_ids = numberStates(h); - - dropUnusedStarts(h, state_ids); + auto state_ids = numberStates(h, {}); // Quick exit: if we've got an embarrassment of riches, i.e. more states // than we can implement in our largest NFA model, bail here. - u32 numStates = countStates(h, state_ids, false); + u32 numStates = countStates(state_ids); if (numStates > NFA_MAX_STATES) { DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates); return nullptr; @@ -479,7 +729,7 @@ aligned_unique_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, assert(sanityCheckGraph(h, state_ids)); - map tops; /* only the standards tops for nfas */ + map> tops; /* only the standards tops for nfas */ set zombies; vector repeats; map reportSquashMap; @@ -508,10 +758,13 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, if (!cc.grey.allowLimExNFA) { return false; } + + assert(!can_never_match(g)); + // Quick check: we can always implement an NFA with less than NFA_MAX_STATES // states. 
Note that top masks can generate extra states, so we account for // those here too. - if (num_vertices(g) + NFA_MAX_TOP_MASKS < NFA_MAX_STATES) { + if (num_vertices(g) + getTops(g).size() < NFA_MAX_STATES) { return true; } @@ -532,12 +785,12 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, ue2::unordered_map state_ids; vector repeats; - map tops; + map> tops; unique_ptr h = prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc, state_ids, repeats, tops); assert(h); - u32 numStates = countStates(*h, state_ids, false); + u32 numStates = countStates(state_ids); if (numStates <= NFA_MAX_STATES) { return numStates; } @@ -579,12 +832,12 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm, ue2::unordered_map state_ids; vector repeats; - map tops; + map> tops; unique_ptr h = prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc, state_ids, repeats, tops); - if (!h || countStates(*h, state_ids, false) > NFA_MAX_STATES) { + if (!h || countStates(state_ids) > NFA_MAX_STATES) { DEBUG_PRINTF("not constructible\n"); return NFA_MAX_ACCEL_STATES + 1; } diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index deaf2ffd..bfba7c71 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -69,7 +69,7 @@ void findAccelFriendGeneration(const NGHolder &g, const CharReach &cr, } const CharReach &acr = g[v].char_reach; - DEBUG_PRINTF("checking %u\n", g[v].index); + DEBUG_PRINTF("checking %zu\n", g[v].index); if (acr.count() < WIDE_FRIEND_MIN || !acr.isSubsetOf(cr)) { DEBUG_PRINTF("bad reach %zu\n", acr.count()); @@ -86,7 +86,7 @@ void findAccelFriendGeneration(const NGHolder &g, const CharReach &cr, next_preds->insert(v); insert(next_cands, adjacent_vertices(v, g)); - DEBUG_PRINTF("%u is a friend indeed\n", g[v].index); + DEBUG_PRINTF("%zu is a friend indeed\n", g[v].index); friends->insert(v); next_cand:; } @@ -675,7 +675,7 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) { while (true) { if (hasSelfLoop(v, g)) { - DEBUG_PRINTF("woot %u\n", g[v].index); + DEBUG_PRINTF("woot %zu\n", g[v].index); return v; } if (out_degree(v, g) != 1) { @@ -837,7 +837,7 @@ bool nfaCheckAccel(const NGHolder &g, NFAVertex v, CharReach terminating = g[v].char_reach; terminating.flip(); - DEBUG_PRINTF("vertex %u is cyclic and has %zu stop chars%s\n", + DEBUG_PRINTF("vertex %zu is cyclic and has %zu stop chars%s\n", g[v].index, terminating.count(), allow_wide ? " (w)" : ""); diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index 9229457c..a5f3468b 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -40,6 +40,7 @@ #include "util/depth.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/ue2_graph.h" #include "util/ue2string.h" #include @@ -49,7 +50,6 @@ #include using namespace std; -using boost::vertex_index; namespace ue2 { @@ -64,24 +64,29 @@ namespace { /* Small literal graph type used for the suffix tree used in * compressAndScore. 
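 * Reversed literals are added as paths from the root, so literals that share
 * a suffix share a prefix of their path (and thus vertices) in this graph.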
*/ - struct LitGraphVertexProps { - LitGraphVertexProps() {} - explicit LitGraphVertexProps(const ue2_literal::elem &c_in) : c(c_in) {} + LitGraphVertexProps() = default; + explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {} ue2_literal::elem c; // string element (char + bool) + size_t index; // managed by ue2_graph }; struct LitGraphEdgeProps { - LitGraphEdgeProps() {} + LitGraphEdgeProps() = default; explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} u64a score = NO_LITERAL_AT_EDGE_SCORE; - size_t index; /* only initialised when the reverse edges are added. */ + size_t index; // managed by ue2_graph +}; + +struct LitGraph + : public ue2_graph { + + LitGraph() : root(add_vertex(*this)), sink(add_vertex(*this)) {} + + const vertex_descriptor root; + const vertex_descriptor sink; }; -/* keep edgeList = listS as you cannot remove edges if edgeList = vecS */ -typedef boost::adjacency_list LitGraph; typedef LitGraph::vertex_descriptor LitVertex; typedef LitGraph::edge_descriptor LitEdge; @@ -94,17 +99,16 @@ typedef std::queue LitVertexQ; /** \brief Dump the literal graph in Graphviz format. */ static UNUSED -void dumpGraph(const char *filename, const LitGraph &lg, const LitVertex &root, - const LitVertex &sink) { +void dumpGraph(const char *filename, const LitGraph &lg) { ofstream fout(filename); fout << "digraph G {" << endl; for (auto v : vertices_range(lg)) { - fout << boost::get(vertex_index, lg, v); - if (v == root) { + fout << lg[v].index; + if (v == lg.root) { fout << "[label=\"ROOT\"];"; - } else if (v == sink) { + } else if (v == lg.sink) { fout << "[label=\"SINK\"];"; } else { ue2_literal s; @@ -116,10 +120,9 @@ void dumpGraph(const char *filename, const LitGraph &lg, const LitVertex &root, for (const auto &e : edges_range(lg)) { LitVertex u = source(e, lg), v = target(e, lg); - fout << boost::get(vertex_index, lg, u) << " -> " << - boost::get(vertex_index, lg, v) << - "[label=\"" << lg[e].score << "\"]" << - ";" << endl; + fout << lg[u].index << " -> " << lg[v].index << "[label=\"" + << lg[e].score << "\"]" + << ";" << endl; } fout << "}" << endl; @@ -141,11 +144,11 @@ bool allowExpand(size_t numItems, size_t totalPathsSoFar) { } static -LitVertex addToLitGraph(LitGraph &lg, LitVertex sink, - LitVertex pred, const ue2_literal::elem &c) { +LitVertex addToLitGraph(LitGraph &lg, LitVertex pred, + const ue2_literal::elem &c) { // Check if we already have this in the graph. 
for (auto v : adjacent_vertices_range(pred, lg)) { - if (v == sink) { + if (v == lg.sink) { continue; } if (lg[v].c == c) { @@ -159,9 +162,10 @@ LitVertex addToLitGraph(LitGraph &lg, LitVertex sink, } static -void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex sink, - LitVertex pred, const CharReach &cr, NFAVertex v) { - for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) { +void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex pred, + const CharReach &cr, NFAVertex v) { + for (size_t i = cr.find_first(); i != CharReach::npos; + i = cr.find_next(i)) { if (myisupper(i) && cr.test(mytolower(i))) { // ignore upper half of a nocase pair continue; @@ -169,14 +173,14 @@ void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex sink, bool nocase = myislower(i) && cr.test(mytoupper(i)); ue2_literal::elem c((char)i, nocase); - LitVertex lv = addToLitGraph(lg, sink, pred, c); + LitVertex lv = addToLitGraph(lg, pred, c); workQ.push(VertexPair(lv, v)); } } static -void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex root, - LitVertex sink, const NGHolder &g, const NFAEdge &e) { +void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, const NGHolder &g, + const NFAEdge &e) { NFAVertex u = source(e, g); NFAVertex v = target(e, g); const CharReach &cr = g[v].char_reach; @@ -185,7 +189,7 @@ void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex root, return; } - addToQueue(workQ, lg, sink, root, cr, u); + addToQueue(workQ, lg, lg.root, cr, u); } static @@ -197,7 +201,8 @@ u32 crCardinality(const CharReach &cr) { } u32 rv = 0; - for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) { + for (size_t i = cr.find_first(); i != CharReach::npos; + i = cr.find_next(i)) { if (myisupper(i) && cr.test(mytolower(i))) { // ignore upper half of a nocase pair continue; @@ -212,10 +217,10 @@ u32 crCardinality(const CharReach &cr) { * identifying vertices connected to the sink and removing their other * out-edges. */ static -void filterLitGraph(LitGraph &lg, const LitVertex sink) { - for (auto v : inv_adjacent_vertices_range(sink, lg)) { - remove_out_edge_if(v, [&lg, &sink](const LitEdge &e) { - return target(e, lg) != sink; +void filterLitGraph(LitGraph &lg) { + for (auto v : inv_adjacent_vertices_range(lg.sink, lg)) { + remove_out_edge_if(v, [&lg](const LitEdge &e) { + return target(e, lg) != lg.sink; }, lg); } @@ -228,13 +233,12 @@ void filterLitGraph(LitGraph &lg, const LitVertex sink) { * from each predecessor of the sink (note: it's a suffix tree except for this * convenience) towards the source, storing each string as we go. 
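 * Each such walk yields a single literal: the in-degree assertion below
 * guarantees that the chain of predecessors back to the root is unique.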
*/ static -void extractLiterals(const LitGraph &lg, const LitVertex root, - const LitVertex sink, set &s) { +void extractLiterals(const LitGraph &lg, set &s) { ue2_literal lit; - for (auto u : inv_adjacent_vertices_range(sink, lg)) { + for (auto u : inv_adjacent_vertices_range(lg.sink, lg)) { lit.clear(); - while (u != root) { + while (u != lg.root) { lit.push_back(lg[u].c); assert(in_degree(u, lg) <= 1); LitGraph::inv_adjacency_iterator ai2, ae2; @@ -276,11 +280,9 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e, } LitGraph lg; - LitVertex root = add_vertex(lg); - LitVertex sink = add_vertex(lg); LitVertexQ workQ; - initWorkQueue(workQ, lg, root, sink, g, e); + initWorkQueue(workQ, lg, g, e); while (!workQ.empty()) { const LitVertex lv = workQ.front().first; @@ -289,18 +291,18 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e, u32 cr_card = crCardinality(cr); size_t numItems = cr_card * in_degree(t, g); - size_t committed_count = workQ.size() + in_degree(sink, lg) - 1; + size_t committed_count = workQ.size() + in_degree(lg.sink, lg) - 1; if (g[t].index == NODE_START) { // reached start, add to literal set - add_edge_if_not_present(lv, sink, lg); + add_edge_if_not_present(lv, lg.sink, lg); goto next_work_elem; } // Expand next vertex if (allowExpand(numItems, committed_count)) { for (auto u : inv_adjacent_vertices_range(t, g)) { - addToQueue(workQ, lg, sink, lv, cr, u); + addToQueue(workQ, lg, lv, cr, u); } goto next_work_elem; } @@ -316,26 +318,26 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e, bool nocase = myislower(i) && cr.test(mytoupper(i)); ue2_literal::elem c((char)i, nocase); - LitVertex lt = addToLitGraph(lg, sink, lv, c); - add_edge_if_not_present(lt, sink, lg); + LitVertex lt = addToLitGraph(lg, lv, c); + add_edge_if_not_present(lt, lg.sink, lg); } goto next_work_elem; } // add to literal set - add_edge_if_not_present(lv, sink, lg); + add_edge_if_not_present(lv, lg.sink, lg); next_work_elem: workQ.pop(); } - filterLitGraph(lg, sink); - //dumpGraph("litgraph.dot", lg, root, sink); - extractLiterals(lg, root, sink, s); + filterLitGraph(lg); + //dumpGraph("litgraph.dot", lg); + extractLiterals(lg, s); // Our literal set should contain no literal that is a suffix of another. assert(!hasSuffixLiterals(s)); - DEBUG_PRINTF("edge %u (%u->%u) produced %zu literals\n", g[e].index, + DEBUG_PRINTF("edge %zu (%zu->%zu) produced %zu literals\n", g[e].index, g[source(e, g)].index, g[target(e, g)].index, s.size()); } @@ -409,16 +411,15 @@ u64a calculateScore(const ue2_literal &s) { /** Adds a literal in reverse order, building up a suffix tree. */ static -void addReversedLiteral(const ue2_literal &lit, LitGraph &lg, - const LitVertex &root, const LitVertex &sink) { +void addReversedLiteral(const ue2_literal &lit, LitGraph &lg) { DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str()); ue2_literal suffix; - LitVertex v = root; + LitVertex v = lg.root; for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { suffix.push_back(*it); LitVertex w; for (auto v2 : adjacent_vertices_range(v, lg)) { - if (v2 != sink && lg[v2].c == *it) { + if (v2 != lg.sink && lg[v2].c == *it) { w = v2; goto next_char; } @@ -430,17 +431,18 @@ next_char: } // Wire the last vertex to the sink. 
- add_edge(v, sink, lg); + add_edge(v, lg.sink, lg); } static void extractLiterals(const vector &cutset, const LitGraph &lg, - const LitVertex &root, set &s) { + set &s) { for (const auto &e : cutset) { - LitVertex u = source(e, lg), v = target(e, lg); + LitVertex u = source(e, lg); + LitVertex v = target(e, lg); ue2_literal lit; lit.push_back(lg[v].c); - while (u != root) { + while (u != lg.root) { lit.push_back(lg[u].c); assert(in_degree(u, lg) == 1); LitGraph::inv_adjacency_iterator ai, ae; @@ -487,10 +489,7 @@ const char *describeColor(boost::default_color_type c) { static vector add_reverse_edges_and_index(LitGraph &lg) { vector fwd_edges; - - size_t next_index = 0; for (const auto &e : edges_range(lg)) { - lg[e].index = next_index++; fwd_edges.push_back(e); } @@ -502,9 +501,7 @@ vector add_reverse_edges_and_index(LitGraph &lg) { assert(!edge(v, u, lg).second); - LitEdge rev = add_edge(v, u, lg).first; - lg[rev].score = 0; - lg[rev].index = next_index++; + LitEdge rev = add_edge(v, u, LitGraphEdgeProps(0), lg).first; rev_map[lg[e].index] = rev; rev_map[lg[rev].index] = e; } @@ -513,20 +510,19 @@ vector add_reverse_edges_and_index(LitGraph &lg) { } static -void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink, - vector &cutset) { +void findMinCut(LitGraph &lg, vector &cutset) { cutset.clear(); - //dumpGraph("litgraph.dot", lg, root, sink); + //dumpGraph("litgraph.dot", lg); - assert(!in_degree(root, lg)); - assert(!out_degree(sink, lg)); + assert(!in_degree(lg.root, lg)); + assert(!out_degree(lg.sink, lg)); size_t num_real_edges = num_edges(lg); // Add reverse edges for the convenience of the BGL's max flow algorithm. vector rev_edges = add_reverse_edges_and_index(lg); - const auto v_index_map = get(vertex_index, lg); + const auto v_index_map = get(&LitGraphVertexProps::index, lg); const auto e_index_map = get(&LitGraphEdgeProps::index, lg); const size_t num_verts = num_vertices(lg); vector colors(num_verts); @@ -541,7 +537,7 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink, make_iterator_property_map(predecessors.begin(), v_index_map), make_iterator_property_map(colors.begin(), v_index_map), make_iterator_property_map(distances.begin(), v_index_map), - v_index_map, root, sink); + v_index_map, lg.root, lg.sink); DEBUG_PRINTF("done, flow = %llu\n", flow); /* remove reverse edges */ @@ -554,21 +550,20 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink, for (const auto &e : edges_range(lg)) { const LitVertex u = source(e, lg), v = target(e, lg); - const auto ucolor = colors[boost::get(vertex_index, lg, u)]; - const auto vcolor = colors[boost::get(vertex_index, lg, v)]; + const auto ucolor = colors[lg[u].index]; + const auto vcolor = colors[lg[v].index]; - DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", - boost::get(vertex_index, lg, u), describeColor(ucolor), - boost::get(vertex_index, lg, v), describeColor(vcolor), + DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index, + describeColor(ucolor), lg[v].index, describeColor(vcolor), lg[e].score); if (ucolor != boost::white_color && vcolor == boost::white_color) { - assert(target(e, lg) != sink); + assert(v != lg.sink); white_cut.push_back(e); white_flow += lg[e].score; } if (ucolor == boost::black_color && vcolor != boost::black_color) { - assert(target(e, lg) != sink); + assert(v != lg.sink); black_cut.push_back(e); black_flow += lg[e].score; } @@ -608,21 +603,19 @@ u64a compressAndScore(set &s) { initialScore); LitGraph lg; - const LitVertex root = 
add_vertex(lg); - const LitVertex sink = add_vertex(lg); for (const auto &lit : s) { - addReversedLiteral(lit, lg, root, sink); + addReversedLiteral(lit, lg); } DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n", num_vertices(lg), num_edges(lg)); vector cutset; - findMinCut(lg, root, sink, cutset); + findMinCut(lg, cutset); s.clear(); - extractLiterals(cutset, lg, root, s); + extractLiterals(cutset, lg, s); u64a score = scoreSet(s); DEBUG_PRINTF("compressed score is %llu\n", score); @@ -791,7 +784,7 @@ bool splitOffLeadingLiteral_i(const NGHolder &g, bool anch, } while (true) { - DEBUG_PRINTF("validating vertex %u\n", g[v].index); + DEBUG_PRINTF("validating vertex %zu\n", g[v].index); assert(v != g.acceptEod && v != g.accept); diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp index 871c8ac7..e3cfe867 100644 --- a/src/nfagraph/ng_literal_component.cpp +++ b/src/nfagraph/ng_literal_component.cpp @@ -95,7 +95,7 @@ void addToString(string &s, const NGHolder &g, NFAVertex v) { static bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, set &dead) { - DEBUG_PRINTF("examine vertex %u\n", g[v].index); + DEBUG_PRINTF("examine vertex %zu\n", g[v].index); bool nocase = false, casefixed = false; assert(!is_special(v, g)); @@ -109,7 +109,7 @@ bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, assert(edge(g.start, v, g).second); assert(edge(g.startDs, v, g).second); } - if (hasGreaterInDegree(reqInDegree, v, g)) { + if (in_degree(v, g) > reqInDegree) { DEBUG_PRINTF("extra in-edges\n"); return false; } @@ -134,7 +134,7 @@ bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, u = v; // previous vertex v = *(adjacent_vertices(v, g).first); - DEBUG_PRINTF("loop, v=%u\n", g[v].index); + DEBUG_PRINTF("loop, v=%zu\n", g[v].index); if (is_special(v, g)) { if (v == g.accept || v == g.acceptEod) { diff --git a/src/nfagraph/ng_literal_decorated.cpp b/src/nfagraph/ng_literal_decorated.cpp index 02b25a73..89c01a6c 100644 --- a/src/nfagraph/ng_literal_decorated.cpp +++ b/src/nfagraph/ng_literal_decorated.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -77,7 +77,7 @@ bool findPaths(const NGHolder &g, vector &paths) { read_count[g[v].index] = out_degree(v, g); - DEBUG_PRINTF("setting read_count to %zu for %u\n", + DEBUG_PRINTF("setting read_count to %zu for %zu\n", read_count[g[v].index], g[v].index); if (v == g.start || v == g.startDs) { @@ -117,7 +117,7 @@ bool findPaths(const NGHolder &g, vector &paths) { read_count[g[u].index]--; if (!read_count[g[u].index]) { - DEBUG_PRINTF("clearing %u as finished reading\n", g[u].index); + DEBUG_PRINTF("clearing %zu as finished reading\n", g[u].index); built[g[u].index].clear(); built[g[u].index].shrink_to_fit(); } @@ -138,9 +138,9 @@ bool hasLargeDegreeVertex(const NGHolder &g) { if (is_special(v, g)) { // specials can have large degree continue; } - if (has_greater_degree(MAX_VERTEX_DEGREE, v, g)) { - DEBUG_PRINTF("vertex %u has degree %zu\n", g[v].index, - boost::degree(v, g.g)); + if (degree(v, g) > MAX_VERTEX_DEGREE) { + DEBUG_PRINTF("vertex %zu has degree %zu\n", g[v].index, + degree(v, g)); return true; } } @@ -188,7 +188,8 @@ struct PathMask { } // Reports are attached to the second-to-last vertex. 
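 // (The final vertex of a path is an accept special, which carries no
 // reports of its own.)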
- reports = g[*next(path.rbegin())].reports; + NFAVertex u = *std::next(path.rbegin()); + reports = g[u].reports; assert(!reports.empty()); } diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 39788570..375086a4 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -36,7 +36,6 @@ #include "nfa/rdfa.h" #include "ng_holder.h" #include "ng_mcclellan_internal.h" -#include "ng_restructuring.h" #include "ng_squash.h" #include "ng_util.h" #include "ue2common.h" @@ -329,7 +328,7 @@ void markToppableStarts(const NGHolder &g, const flat_set &unused, } for (const auto &trigger : triggers) { if (triggerAllowed(g, v, triggers, trigger)) { - DEBUG_PRINTF("idx %u is valid location for top\n", g[v].index); + DEBUG_PRINTF("idx %zu is valid location for top\n", g[v].index); out->set(g[v].index); break; } @@ -348,10 +347,11 @@ public: using StateMap = typename Automaton_Traits::StateMap; Automaton_Base(const ReportManager *rm_in, const NGHolder &graph_in, - const flat_set &unused_in, bool single_trigger, + bool single_trigger, const vector> &triggers, bool prunable_in) : rm(rm_in), graph(graph_in), numStates(num_vertices(graph)), - unused(unused_in), init(Automaton_Traits::init_states(numStates)), + unused(getRedundantStarts(graph_in)), + init(Automaton_Traits::init_states(numStates)), initDS(Automaton_Traits::init_states(numStates)), squash(Automaton_Traits::init_states(numStates)), accept(Automaton_Traits::init_states(numStates)), @@ -444,7 +444,7 @@ private: public: const NGHolder &graph; u32 numStates; - const flat_set &unused; + const flat_set unused; vector v_by_index; vector cr_by_index; /* pre alpha'ed */ StateSet init; @@ -482,9 +482,9 @@ struct Big_Traits { class Automaton_Big : public Automaton_Base { public: Automaton_Big(const ReportManager *rm_in, const NGHolder &graph_in, - const flat_set &unused_in, bool single_trigger, + bool single_trigger, const vector> &triggers, bool prunable_in) - : Automaton_Base(rm_in, graph_in, unused_in, single_trigger, triggers, + : Automaton_Base(rm_in, graph_in, single_trigger, triggers, prunable_in) {} }; @@ -510,14 +510,36 @@ struct Graph_Traits { class Automaton_Graph : public Automaton_Base { public: Automaton_Graph(const ReportManager *rm_in, const NGHolder &graph_in, - const flat_set &unused_in, bool single_trigger, - const vector> &triggers, bool prunable_in) - : Automaton_Base(rm_in, graph_in, unused_in, single_trigger, triggers, + bool single_trigger, + const vector> &triggers, bool prunable_in) + : Automaton_Base(rm_in, graph_in, single_trigger, triggers, prunable_in) {} }; } // namespace +static +bool startIsRedundant(const NGHolder &g) { + set start; + set startDs; + + insert(&start, adjacent_vertices(g.start, g)); + insert(&startDs, adjacent_vertices(g.startDs, g)); + + return start == startDs; +} + +flat_set getRedundantStarts(const NGHolder &g) { + flat_set dead; + if (startIsRedundant(g)) { + dead.insert(g.start); + } + if (proper_out_degree(g.startDs, g) == 0) { + dead.insert(g.startDs); + } + return dead; +} + unique_ptr buildMcClellan(const NGHolder &graph, const ReportManager *rm, bool single_trigger, const vector> &triggers, @@ -526,8 +548,6 @@ unique_ptr buildMcClellan(const NGHolder &graph, return nullptr; } - auto unused = findUnusedStates(graph); - DEBUG_PRINTF("attempting to build ?%d? mcclellan\n", (int)graph.kind); assert(allMatchStatesHaveReports(graph)); @@ -553,8 +573,7 @@ unique_ptr buildMcClellan(const NGHolder &graph, if (numStates <= NFA_STATE_LIMIT) { /* Fast path. 
Automaton_Graph uses a bitfield internally to represent * states and is quicker than Automaton_Big. */ - Automaton_Graph n(rm, graph, unused, single_trigger, triggers, - prunable); + Automaton_Graph n(rm, graph, single_trigger, triggers, prunable); if (determinise(n, rdfa->states, state_limit)) { DEBUG_PRINTF("state limit exceeded\n"); return nullptr; /* over state limit */ @@ -566,7 +585,7 @@ unique_ptr buildMcClellan(const NGHolder &graph, rdfa->alpha_remap = n.alpha; } else { /* Slow path. Too many states to use Automaton_Graph. */ - Automaton_Big n(rm, graph, unused, single_trigger, triggers, prunable); + Automaton_Big n(rm, graph, single_trigger, triggers, prunable); if (determinise(n, rdfa->states, state_limit)) { DEBUG_PRINTF("state limit exceeded\n"); return nullptr; /* over state limit */ diff --git a/src/nfagraph/ng_mcclellan_internal.h b/src/nfagraph/ng_mcclellan_internal.h index 22fcf01e..b78dac3b 100644 --- a/src/nfagraph/ng_mcclellan_internal.h +++ b/src/nfagraph/ng_mcclellan_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,6 @@ #include "ue2common.h" #include "nfa/mcclellancompile.h" #include "nfagraph/ng_holder.h" -#include "nfagraph/ng_restructuring.h" // for NO_STATE #include "util/charreach.h" #include "util/graph_range.h" #include "util/ue2_containers.h" @@ -69,6 +68,13 @@ void markToppableStarts(const NGHolder &g, const flat_set &unused, const std::vector> &triggers, boost::dynamic_bitset<> *out); +/** + * \brief Returns a set of start vertices that will not participate in an + * implementation of this graph. These are either starts with no successors or + * starts which are redundant with startDs. + */ +flat_set getRedundantStarts(const NGHolder &g); + template void transition_graph(autom &nfa, const std::vector &vByStateId, const typename autom::StateSet &in, diff --git a/src/nfagraph/ng_misc_opt.cpp b/src/nfagraph/ng_misc_opt.cpp index 2e02933a..29939fec 100644 --- a/src/nfagraph/ng_misc_opt.cpp +++ b/src/nfagraph/ng_misc_opt.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -69,13 +69,18 @@ #include "util/charreach.h" #include "util/container.h" #include "util/graph_range.h" +#include "util/ue2_containers.h" #include "ue2common.h" +#include +#include + #include #include #include using namespace std; +using boost::make_filtered_graph; namespace ue2 { @@ -94,8 +99,8 @@ void findCandidates(NGHolder &g, const vector &ordering, // For `v' to be a candidate, its predecessors must all have the same // successor set as `v'. - set succ_v, succ_u; - succ(g, v, &succ_v); + auto succ_v = succs(v, g); + flat_set succ_u; for (auto u : inv_adjacent_vertices_range(v, g)) { succ_u.clear(); @@ -104,7 +109,7 @@ void findCandidates(NGHolder &g, const vector &ordering, goto next_cand; } } - DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index); + DEBUG_PRINTF("vertex %zu is a candidate\n", g[v].index); cand->push_back(v); next_cand:; } @@ -125,8 +130,8 @@ void findCandidates_rev(NGHolder &g, const vector &ordering, // For `v' to be a candidate, its predecessors must all have the same // successor set as `v'. 
- set pred_v, pred_u; - pred(g, v, &pred_v); + auto pred_v = preds(v, g); + flat_set pred_u; for (auto u : adjacent_vertices_range(v, g)) { pred_u.clear(); @@ -135,7 +140,7 @@ void findCandidates_rev(NGHolder &g, const vector &ordering, goto next_cand; } } - DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index); + DEBUG_PRINTF("vertex %zu is a candidate\n", g[v].index); cand->push_back(v); next_cand:; } @@ -172,8 +177,7 @@ void succCRIntersection(const NGHolder &g, NFAVertex v, CharReach &add) { static set findSustainSet(const NGHolder &g, NFAVertex p, bool ignore_starts, const CharReach &new_cr) { - set cand; - pred(g, p, &cand); + auto cand = preds>(p, g); if (ignore_starts) { cand.erase(g.startDs); } @@ -209,8 +213,7 @@ set findSustainSet(const NGHolder &g, NFAVertex p, static set findSustainSet_rev(const NGHolder &g, NFAVertex p, const CharReach &new_cr) { - set cand; - succ(g, p, &cand); + auto cand = succs>(p, g); /* remove elements from cand until the sustain set property holds */ bool changed; do { @@ -240,7 +243,7 @@ set findSustainSet_rev(const NGHolder &g, NFAVertex p, static bool enlargeCyclicVertex(NGHolder &g, som_type som, NFAVertex v) { - DEBUG_PRINTF("considering vertex %u\n", g[v].index); + DEBUG_PRINTF("considering vertex %zu\n", g[v].index); const CharReach &v_cr = g[v].char_reach; CharReach add; @@ -259,7 +262,7 @@ bool enlargeCyclicVertex(NGHolder &g, som_type som, NFAVertex v) { if (p == v) { continue; } - DEBUG_PRINTF("looking at pred %u\n", g[p].index); + DEBUG_PRINTF("looking at pred %zu\n", g[p].index); bool ignore_sds = som; /* if we are tracking som, entries into a state from sds are significant. */ @@ -289,13 +292,13 @@ bool enlargeCyclicVertex(NGHolder &g, som_type som, NFAVertex v) { /* the cr can be increased */ g[v].char_reach = add; - DEBUG_PRINTF("vertex %u was widened\n", g[v].index); + DEBUG_PRINTF("vertex %zu was widened\n", g[v].index); return true; } static bool enlargeCyclicVertex_rev(NGHolder &g, NFAVertex v) { - DEBUG_PRINTF("considering vertex %u\n", g[v].index); + DEBUG_PRINTF("considering vertex %zu\n", g[v].index); const CharReach &v_cr = g[v].char_reach; CharReach add; @@ -314,7 +317,7 @@ bool enlargeCyclicVertex_rev(NGHolder &g, NFAVertex v) { if (p == v) { continue; } - DEBUG_PRINTF("looking at succ %u\n", g[p].index); + DEBUG_PRINTF("looking at succ %zu\n", g[p].index); set sustain = findSustainSet_rev(g, p, add); DEBUG_PRINTF("sustain set is %zu\n", sustain.size()); @@ -339,7 +342,7 @@ bool enlargeCyclicVertex_rev(NGHolder &g, NFAVertex v) { /* the cr can be increased */ g[v].char_reach = add; - DEBUG_PRINTF("vertex %u was widened\n", g[v].index); + DEBUG_PRINTF("vertex %zu was widened\n", g[v].index); return true; } @@ -388,7 +391,7 @@ bool improveGraph(NGHolder &g, som_type som) { * enlargeCyclicCR. */ CharReach reduced_cr(NFAVertex v, const NGHolder &g, const map &br_cyclic) { - DEBUG_PRINTF("find minimal cr for %u\n", g[v].index); + DEBUG_PRINTF("find minimal cr for %zu\n", g[v].index); CharReach v_cr = g[v].char_reach; if (proper_in_degree(v, g) != 1) { return v_cr; @@ -546,4 +549,165 @@ bool mergeCyclicDotStars(NGHolder &g) { return true; } +/** + * Returns the set of vertices that cannot be on if v is not on. + */ +static +flat_set findDependentVertices(const NGHolder &g, NFAVertex v) { + auto v_pred = preds(v, g); + flat_set may_be_on; + + /* We need to exclude any vertex that may be reached on a path which is + * incompatible with the vertex v being on. 
*/
+
+    /* A vertex u is bad if:
+     * 1) its reach may be incompatible with v (not a subset)
+     * 2) there is an edge from a bad vertex b and there is either not an
+     *    edge v->u or not an edge b->v.
+     * Note: 2) means v is never bad as it has a selfloop
+     *
+     * Can do this with a DFS from all the initial bad states with a
+     * conditional check down edges. Alternatively, we can just filter these
+     * edges out of the graph first.
+     */
+    flat_set<NFAEdge> no_explore;
+    for (NFAVertex t : adjacent_vertices_range(v, g)) {
+        for (NFAEdge e : in_edges_range(t, g)) {
+            NFAVertex s = source(e, g);
+            if (edge(s, v, g).second) {
+                no_explore.insert(e);
+            }
+        }
+    }
+
+    auto filtered_g = make_filtered_graph(g, make_bad_edge_filter(&no_explore));
+
+    vector<boost::default_color_type> color_raw(num_vertices(g));
+    auto color = make_iterator_property_map(color_raw.begin(),
+                                            get(vertex_index, g));
+    flat_set<NFAVertex> bad;
+    for (NFAVertex b : vertices_range(g)) {
+        if (b != g.start && g[b].char_reach.isSubsetOf(g[v].char_reach)) {
+            continue;
+        }
+        boost::depth_first_visit(filtered_g, b, make_vertex_recorder(bad),
+                                 color);
+    }
+
+    flat_set<NFAVertex> rv;
+    for (NFAVertex u : vertices_range(g)) {
+        if (!contains(bad, u)) {
+            DEBUG_PRINTF("%zu is good\n", g[u].index);
+            rv.insert(u);
+        }
+    }
+    return rv;
+}
+
+static
+bool willBeEnabledConcurrently(NFAVertex main_cyclic, NFAVertex v,
+                               const NGHolder &g) {
+    return is_subset_of(preds(main_cyclic, g), preds(v, g));
+}
+
+static
+bool sometimesEnabledConcurrently(NFAVertex main_cyclic, NFAVertex v,
+                                  const NGHolder &g) {
+    return has_intersection(preds(main_cyclic, g), preds(v, g));
+}
+
+static
+bool pruneUsingSuccessors(NGHolder &g, NFAVertex u, som_type som) {
+    if (som && (is_virtual_start(u, g) || u == g.startDs)) {
+        return false;
+    }
+
+    bool changed = false;
+    DEBUG_PRINTF("using cyclic %zu as base\n", g[u].index);
+    auto children = findDependentVertices(g, u);
+    vector<NFAVertex> u_succs;
+    for (NFAVertex v : adjacent_vertices_range(u, g)) {
+        if (som && is_virtual_start(v, g)) {
+            /* as v is a virtual start, its som has been reset so it cannot
+             * override existing in-progress matches. */
+            continue;
+        }
+        u_succs.push_back(v);
+    }
+    stable_sort(u_succs.begin(), u_succs.end(),
+                [&](NFAVertex a, NFAVertex b) {
+                    return g[a].char_reach.count() > g[b].char_reach.count();
+                });
+    for (NFAVertex v : u_succs) {
+        DEBUG_PRINTF("  using %zu as killer\n", g[v].index);
+        /* Need to distinguish between vertices that are switched on after the
+         * cyclic vs vertices that are switched on concurrently with the
+         * cyclic (subject to a suitable reach) */
+        bool v_peer_of_cyclic = willBeEnabledConcurrently(u, v, g);
+        set<NFAEdge> dead;
+        for (NFAVertex s : adjacent_vertices_range(v, g)) {
+            DEBUG_PRINTF("    looking at preds of %zu\n", g[s].index);
+            for (NFAEdge e : in_edges_range(s, g)) {
+                NFAVertex p = source(e, g);
+                if (!contains(children, p) || p == v || p == u
+                    || p == g.accept) {
+                    DEBUG_PRINTF("%zu not a cand\n", g[p].index);
+                    continue;
+                }
+                if (is_any_accept(s, g) && g[p].reports != g[v].reports) {
+                    DEBUG_PRINTF("%zu bad reports\n", g[p].index);
+                    continue;
+                }
+                /* the out-edges of a vertex that may be enabled on the same
+                 * byte as the cyclic can only be killed by the out-edges of a
+                 * peer vertex which will be enabled with the cyclic (a non-peer
+                 * may not be switched on until another byte is processed).
*/ + if (!v_peer_of_cyclic + && sometimesEnabledConcurrently(u, p, g)) { + DEBUG_PRINTF("%zu can only be squashed by a proper peer\n", + g[p].index); + continue; + } + + if (g[p].char_reach.isSubsetOf(g[v].char_reach)) { + dead.insert(e); + changed = true; + DEBUG_PRINTF("removing edge %zu->%zu\n", g[p].index, + g[s].index); + } else if (is_subset_of(succs(p, g), succs(u, g))) { + if (is_match_vertex(p, g) + && !is_subset_of(g[p].reports, g[v].reports)) { + continue; + } + DEBUG_PRINTF("updating reach on %zu\n", g[p].index); + changed |= (g[p].char_reach & g[v].char_reach).any(); + g[p].char_reach &= ~g[v].char_reach; + } + + } + } + remove_edges(dead, g); + } + + DEBUG_PRINTF("changed %d\n", (int)changed); + return changed; +} + +bool prunePathsRedundantWithSuccessorOfCyclics(NGHolder &g, som_type som) { + /* TODO: the reverse form of this is also possible */ + bool changed = false; + for (NFAVertex v : vertices_range(g)) { + if (hasSelfLoop(v, g) && g[v].char_reach.all()) { + changed |= pruneUsingSuccessors(g, v, som); + } + } + + if (changed) { + pruneUseless(g); + clearReports(g); + } + + return changed; +} + } // namespace ue2 diff --git a/src/nfagraph/ng_misc_opt.h b/src/nfagraph/ng_misc_opt.h index 4955c7af..5ed089dc 100644 --- a/src/nfagraph/ng_misc_opt.h +++ b/src/nfagraph/ng_misc_opt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,6 +72,13 @@ std::vector reduced_cr(const NGHolder &g, /** Remove cyclic stars connected to start */ bool mergeCyclicDotStars(NGHolder &g); +/** + * Given a cyclic state 'c' with a broad reach and a later state 'v' that is + * only reachable if c is still on, then any edges to a successor of a direct + * successor of c with reach a superset of v are redundant. + */ +bool prunePathsRedundantWithSuccessorOfCyclics(NGHolder &h, som_type som); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_netflow.cpp b/src/nfagraph/ng_netflow.cpp index 9004024f..cff26358 100644 --- a/src/nfagraph/ng_netflow.cpp +++ b/src/nfagraph/ng_netflow.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,7 +92,7 @@ void addReverseEdges(NGHolder &g, vector &reverseEdge, if (it == allEdges.end()) { // No reverse edge, add one. NFAVertex u = source(fwd, g), v = target(fwd, g); - NFAEdge rev = add_edge(v, u, g).first; + NFAEdge rev = add_edge(v, u, g); it = allEdges.insert(make_pair(make_pair(vidx, uidx), rev)).first; // Add to capacity map. 
u32 revIndex = g[rev].index; @@ -111,6 +111,7 @@ static void removeEdgesFromIndex(NGHolder &g, vector &capacityMap, u32 idx) { remove_edge_if([&](const NFAEdge &e) { return g[e].index >= idx; }, g); capacityMap.resize(idx); + renumber_edges(g); } /** A wrapper around boykov_kolmogorov_max_flow, returns the max flow and @@ -142,11 +143,10 @@ u64a getMaxFlow(NGHolder &h, const vector &capacityMap_in, vector distances(numVertices); assert(colorMap.size() == numVertices); - const NFAGraph &g = h.g; - auto v_index_map = get(&NFAGraphVertexProps::index, g); - auto e_index_map = get(&NFAGraphEdgeProps::index, g); + auto v_index_map = get(vertex_index, h); + auto e_index_map = get(edge_index, h); - u64a flow = boykov_kolmogorov_max_flow(g, + u64a flow = boykov_kolmogorov_max_flow(h, make_iterator_property_map(capacityMap.begin(), e_index_map), make_iterator_property_map(edgeResiduals.begin(), e_index_map), make_iterator_property_map(reverseEdges.begin(), e_index_map), @@ -158,7 +158,7 @@ u64a getMaxFlow(NGHolder &h, const vector &capacityMap_in, // Remove reverse edges from graph. removeEdgesFromIndex(h, capacityMap, numRealEdges); - assert(num_edges(h.g) == numRealEdges); + assert(num_edges(h) == numRealEdges); DEBUG_PRINTF("flow = %llu\n", flow); return flow; @@ -190,14 +190,14 @@ vector findMinCut(NGHolder &h, const vector &scores) { if (fromColor != boost::white_color && toColor == boost::white_color) { assert(ec <= INVALID_EDGE_CAP); - DEBUG_PRINTF("found white cut edge %u->%u cap %llu\n", + DEBUG_PRINTF("found white cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); observed_white_flow += ec; picked_white.push_back(e); } if (fromColor == boost::black_color && toColor != boost::black_color) { assert(ec <= INVALID_EDGE_CAP); - DEBUG_PRINTF("found black cut edge %u->%u cap %llu\n", + DEBUG_PRINTF("found black cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); observed_black_flow += ec; picked_black.push_back(e); diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index 8abc45b3..012b4e8d 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -220,13 +220,7 @@ void copyInEdges(NGHolder &g, NFAVertex from, NFAVertex to, continue; } - // Check with edge_by_target to cope with predecessors with large - // fan-out. - if (edge_by_target(u, to, g).second) { - continue; - } - - add_edge(u, to, g[e], g); + add_edge_if_not_present(u, to, g[e], g); } } @@ -361,7 +355,7 @@ void reduceRegions(NGHolder &h) { // We may have vertices that have edges to both accept and acceptEod: in // this case, we can optimize for performance by removing the acceptEod // edges. 
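 // (Such a vertex already raises its reports via the accept edge, so for
 // prefiltering purposes the extra end-of-data report adds nothing.)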
- remove_in_edge_if(h.acceptEod, SourceHasEdgeToAccept(h), h.g); + remove_in_edge_if(h.acceptEod, SourceHasEdgeToAccept(h), h); } void prefilterReductions(NGHolder &h, const CompileContext &cc) { @@ -378,13 +372,13 @@ void prefilterReductions(NGHolder &h, const CompileContext &cc) { DEBUG_PRINTF("before: graph with %zu vertices, %zu edges\n", num_vertices(h), num_edges(h)); - h.renumberVertices(); - h.renumberEdges(); + renumber_vertices(h); + renumber_edges(h); reduceRegions(h); - h.renumberVertices(); - h.renumberEdges(); + renumber_vertices(h); + renumber_edges(h); DEBUG_PRINTF("after: graph with %zu vertices, %zu edges\n", num_vertices(h), num_edges(h)); diff --git a/src/nfagraph/ng_prune.cpp b/src/nfagraph/ng_prune.cpp index 473b9586..88f1880f 100644 --- a/src/nfagraph/ng_prune.cpp +++ b/src/nfagraph/ng_prune.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,9 +57,8 @@ namespace ue2 { void pruneUnreachable(NGHolder &g) { deque dead; - if (!hasGreaterInDegree(1, g.acceptEod, g) && - !hasGreaterInDegree(0, g.accept, g) && - edge(g.accept, g.acceptEod, g).second) { + if (in_degree(g.acceptEod, g) == 1 && !in_degree(g.accept, g) + && edge(g.accept, g.acceptEod, g).second) { // Trivial case: there are no in-edges to our accepts (other than // accept->acceptEod), so all non-specials are unreachable. for (auto v : vertices_range(g)) { @@ -70,10 +69,10 @@ void pruneUnreachable(NGHolder &g) { } else { // Walk a reverse graph from acceptEod with Boost's depth_first_visit // call. - typedef reverse_graph RevNFAGraph; - RevNFAGraph revg(g.g); + typedef reverse_graph RevNFAGraph; + RevNFAGraph revg(g); - map colours; + map colours; depth_first_visit(revg, g.acceptEod, make_dfs_visitor(boost::null_visitor()), @@ -104,7 +103,8 @@ void pruneUnreachable(NGHolder &g) { template static -bool pruneForwardUseless(NGHolder &h, const nfag_t &g, NFAVertex s, +bool pruneForwardUseless(NGHolder &h, const nfag_t &g, + typename nfag_t::vertex_descriptor s, vector &vertexColor) { // Begin with all vertices set to white, as DFV only marks visited // vertices. 
@@ -122,9 +122,9 @@ bool pruneForwardUseless(NGHolder &h, const nfag_t &g, NFAVertex s, for (auto v : vertices_range(g)) { u32 idx = g[v].index; if (!is_special(v, g) && vertexColor[idx] == boost::white_color) { - DEBUG_PRINTF("vertex %u is unreachable from %u\n", + DEBUG_PRINTF("vertex %zu is unreachable from %zu\n", g[v].index, g[s].index); - dead.push_back(v); + dead.push_back(NFAVertex(v)); } } @@ -145,17 +145,17 @@ void pruneUseless(NGHolder &g, bool renumber) { assert(hasCorrectlyNumberedVertices(g)); vector vertexColor(num_vertices(g)); - bool work_done = pruneForwardUseless(g, g.g, g.start, vertexColor); - work_done |= pruneForwardUseless( - g, reverse_graph(g.g), g.acceptEod, vertexColor); + bool work_done = pruneForwardUseless(g, g, g.start, vertexColor); + work_done |= pruneForwardUseless(g, reverse_graph(g), + g.acceptEod, vertexColor); if (!work_done) { return; } if (renumber) { - g.renumberEdges(); - g.renumberVertices(); + renumber_edges(g); + renumber_vertices(g); } } @@ -172,7 +172,7 @@ void pruneEmptyVertices(NGHolder &g) { const CharReach &cr = g[v].char_reach; if (cr.none()) { - DEBUG_PRINTF("empty: %u\n", g[v].index); + DEBUG_PRINTF("empty: %zu\n", g[v].index); dead.push_back(v); } } @@ -234,7 +234,7 @@ bool isDominatedByReporter(const NGHolder &g, // Note: reporters with edges only to acceptEod are not considered to // dominate. if (edge(u, g.accept, g).second && contains(g[u].reports, report_id)) { - DEBUG_PRINTF("%u is dominated by %u, and both report %u\n", + DEBUG_PRINTF("%zu is dominated by %zu, and both report %u\n", g[v].index, g[u].index, report_id); return true; } @@ -296,7 +296,7 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { } - sort(begin(reporters), end(reporters), make_index_ordering(g)); + sort(begin(reporters), end(reporters)); reporters.erase(unique(begin(reporters), end(reporters)), end(reporters)); DEBUG_PRINTF("%zu vertices have simple exhaustible reports\n", @@ -315,14 +315,14 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { continue; } if (isDominatedByReporter(g, dom, v, report_id)) { - DEBUG_PRINTF("removed dominated report %u from vertex %u\n", + DEBUG_PRINTF("removed dominated report %u from vertex %zu\n", report_id, g[v].index); g[v].reports.erase(report_id); } } if (g[v].reports.empty()) { - DEBUG_PRINTF("removed edges to accepts from %u, no reports left\n", + DEBUG_PRINTF("removed edges to accepts from %zu, no reports left\n", g[v].index); remove_edge(v, g.accept, g); remove_edge(v, g.acceptEod, g); @@ -337,7 +337,7 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { if (hasOnlySelfLoopAndExhaustibleAccepts(g, rm, v)) { remove_edge(v, v, g); modified = true; - DEBUG_PRINTF("removed self-loop on %u\n", g[v].index); + DEBUG_PRINTF("removed self-loop on %zu\n", g[v].index); } } @@ -349,7 +349,7 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { // We may have only removed self-loops, in which case pruneUseless wouldn't // renumber, so we do edge renumbering explicitly here. 
- g.renumberEdges(); + renumber_edges(g); } /** Removes the given Report ID from vertices connected to accept, and then @@ -388,8 +388,8 @@ void pruneReport(NGHolder &g, ReportID report) { remove_edges(dead, g); pruneUnreachable(g); - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); } /** Removes all Report IDs bar the given one from vertices connected to accept, @@ -431,8 +431,8 @@ void pruneAllOtherReports(NGHolder &g, ReportID report) { remove_edges(dead, g); pruneUnreachable(g); - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); } } // namespace ue2 diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp index 00b2e8ac..7281471f 100644 --- a/src/nfagraph/ng_puff.cpp +++ b/src/nfagraph/ng_puff.cpp @@ -59,7 +59,7 @@ static size_t countChain(const NGHolder &g, NFAVertex v) { size_t count = 0; while (v) { - DEBUG_PRINTF("counting vertex %u\n", g[v].index); + DEBUG_PRINTF("counting vertex %zu\n", g[v].index); if (is_special(v, g)) { break; } @@ -79,7 +79,7 @@ void wireNewAccepts(NGHolder &g, NFAVertex head, continue; } - DEBUG_PRINTF("adding edge: %u -> accept\n", g[u].index); + DEBUG_PRINTF("adding edge: %zu -> accept\n", g[u].index); assert(!edge(u, g.accept, g).second); assert(!edge(u, g.acceptEod, g).second); add_edge(u, g.accept, g); @@ -136,13 +136,13 @@ bool singleStart(const NGHolder &g) { for (auto v : adjacent_vertices_range(g.start, g)) { if (!is_special(v, g)) { - DEBUG_PRINTF("saw %u\n", g[v].index); + DEBUG_PRINTF("saw %zu\n", g[v].index); seen.insert(v); } } for (auto v : adjacent_vertices_range(g.startDs, g)) { if (!is_special(v, g)) { - DEBUG_PRINTF("saw %u\n", g[v].index); + DEBUG_PRINTF("saw %zu\n", g[v].index); seen.insert(v); } } @@ -158,7 +158,7 @@ bool triggerResetsPuff(const NGHolder &g, NFAVertex head) { for (auto u : inv_adjacent_vertices_range(head, g)) { if (!g[u].char_reach.isSubsetOf(puff_escapes)) { - DEBUG_PRINTF("no reset on trigger %u %u\n", g[u].index, + DEBUG_PRINTF("no reset on trigger %zu %zu\n", g[u].index, g[head].index); return false; } @@ -172,7 +172,7 @@ bool triggerResetsPuff(const NGHolder &g, NFAVertex head) { * */ static bool triggerFloodsPuff(const NGHolder &g, NFAVertex head) { - DEBUG_PRINTF("head = %u\n", g[head].index); + DEBUG_PRINTF("head = %zu\n", g[head].index); const CharReach &puff_cr = g[head].char_reach; @@ -186,14 +186,14 @@ bool triggerFloodsPuff(const NGHolder &g, NFAVertex head) { if (proper_in_degree(head, g) == 1 && puff_cr == g[getSoleSourceVertex(g, head)].char_reach) { head = getSoleSourceVertex(g, head); - DEBUG_PRINTF("temp new head = %u\n", g[head].index); + DEBUG_PRINTF("temp new head = %zu\n", g[head].index); } for (auto s : inv_adjacent_vertices_range(head, g)) { - DEBUG_PRINTF("s = %u\n", g[s].index); + DEBUG_PRINTF("s = %zu\n", g[s].index); if (!puff_cr.isSubsetOf(g[s].char_reach)) { - DEBUG_PRINTF("no flood on trigger %u %u\n", - g[s].index, g[head].index); + DEBUG_PRINTF("no flood on trigger %zu %zu\n", g[s].index, + g[head].index); return false; } @@ -268,7 +268,7 @@ void constructPuff(NGHolder &g, const NFAVertex a, const NFAVertex puffv, RoseBuild &rose, ReportManager &rm, flat_set &chain_reports, bool prefilter) { DEBUG_PRINTF("constructing Puff for report %u\n", report); - DEBUG_PRINTF("a = %u\n", g[a].index); + DEBUG_PRINTF("a = %zu\n", g[a].index); const Report &puff_report = rm.getReport(report); const bool simple_exhaust = isSimpleExhaustible(puff_report); @@ -349,7 +349,7 @@ bool doComponent(RoseBuild &rose, 
ReportManager &rm, NGHolder &g, NFAVertex a, } nodes.push_back(a); - DEBUG_PRINTF("vertex %u has in_degree %zu\n", g[a].index, + DEBUG_PRINTF("vertex %zu has in_degree %zu\n", g[a].index, in_degree(a, g)); a = getSoleSourceVertex(g, a); @@ -387,10 +387,10 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a, bool auto_restart = false; - DEBUG_PRINTF("a = %u\n", g[a].index); + DEBUG_PRINTF("a = %zu\n", g[a].index); if (nodes.size() < MIN_PUFF_LENGTH || a == g.startDs) { - DEBUG_PRINTF("bad %zu %u\n", nodes.size(), g[a].index); + DEBUG_PRINTF("bad %zu %zu\n", nodes.size(), g[a].index); if (nodes.size() < MIN_PUFF_LENGTH) { return false; } else { diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp index 26599251..76bc93da 100644 --- a/src/nfagraph/ng_redundancy.cpp +++ b/src/nfagraph/ng_redundancy.cpp @@ -307,16 +307,10 @@ void markForRemoval(const NFAVertex v, VertexInfoMap &infoMap, static bool hasInEdgeTops(const NGHolder &g, NFAVertex v) { - bool exists; - NFAEdge e; - tie(e, exists) = edge_by_target(g.start, v, g); - if (exists && g[e].top != 0) { - return true; - } - return false; + NFAEdge e = edge(g.start, v, g); + return e && !g[e].tops.empty(); } - /** Transform (1), removal of redundant vertices. */ static bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap, @@ -348,8 +342,7 @@ bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap, } if (info.pred.empty() || info.succ.empty()) { - DEBUG_PRINTF("vertex %u has empty pred/succ list\n", - g[v].index); + DEBUG_PRINTF("vertex %zu has empty pred/succ list\n", g[v].index); assert(0); // non-special states should always have succ/pred lists continue; } @@ -448,7 +441,7 @@ bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap, CharReach &otherReach = g[t].char_reach; if (currReach.isSubsetOf(otherReach)) { - DEBUG_PRINTF("removing redundant vertex %u (keeping %u)\n", + DEBUG_PRINTF("removing redundant vertex %zu (keeping %zu)\n", g[v].index, g[t].index); markForRemoval(v, infoMap, removable); changed = true; @@ -539,9 +532,6 @@ bool doDiamondMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap, continue; } - /* ensure that we look for candidates in the same order */ - sort(intersection.begin(), intersection.end(), make_index_ordering(g)); - const CharReach &currReach = g[v].char_reach; const auto &currReports = g[v].reports; for (auto t : intersection) { @@ -578,8 +568,8 @@ bool doDiamondMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap, CharReach &otherReach = g[t].char_reach; otherReach |= currReach; // v can be removed - DEBUG_PRINTF("removing redundant vertex %u and merging " - "reachability with vertex %u\n", + DEBUG_PRINTF("removing redundant vertex %zu and merging " + "reachability with vertex %zu\n", g[v].index, g[t].index); markForRemoval(v, infoMap, removable); changed = true; @@ -645,14 +635,14 @@ bool reversePathReachSubset(const NFAEdge &e, const NFAVertex &dom, } NFAVertex start = source(e, g); - using RevGraph = boost::reverse_graph; + using RevGraph = boost::reverse_graph; map vertexColor; // Walk the graph backwards from v, examining each node. We fail (return // false) if we encounter a node with reach NOT a subset of domReach, and // we stop searching at dom. 
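/* The rewritten hasInEdgeTops() above shrinks to two lines because of two
 * ue2_graph conveniences: edge(u, v, g) yields a descriptor that tests false
 * when no such edge exists, and the edge property now carries a set of tops
 * rather than a single u32. FakeEdge below is an invented stand-in, not
 * Hyperscan's NFAEdge, sketching why "return e && !g[e].tops.empty();" is
 * well-formed:
 */
#include <cassert>
#include <set>

struct FakeEdge {
    int id = -1; // -1 plays the role of "no such edge"
    explicit operator bool() const { return id >= 0; }
};

int main() {
    FakeEdge absent;
    FakeEdge present;
    present.id = 7;
    std::set<unsigned> tops{0u};
    assert(!absent);
    assert(present && !tops.empty()); // shape of the new return expression
    return 0;
}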
try { - depth_first_visit(RevGraph(g.g), start, + depth_first_visit(RevGraph(g), start, ReachSubsetVisitor(domReach), make_assoc_property_map(vertexColor), VertexIs(dom)); @@ -674,16 +664,15 @@ bool forwardPathReachSubset(const NFAEdge &e, const NFAVertex &dom, } NFAVertex start = target(e, g); - map vertexColor; + map vertexColor; // Walk the graph forward from v, examining each node. We fail (return // false) if we encounter a node with reach NOT a subset of domReach, and // we stop searching at dom. try { - depth_first_visit(g.g, start, - ReachSubsetVisitor(domReach), + depth_first_visit(g, start, ReachSubsetVisitor(domReach), make_assoc_property_map(vertexColor), - VertexIs(dom)); + VertexIs(dom)); } catch(ReachMismatch&) { return false; } @@ -746,11 +735,10 @@ u32 findCyclic(const NGHolder &g, vector &cyclic) { for (auto v : vertices_range(g)) { assert(g[v].index < cyclic.size()); - bool c = edge(v, v, g).second; - if (c) { + if (hasSelfLoop(v, g)) { count++; + cyclic[g[v].index] = true; } - cyclic[g[v].index] = c; } return count; @@ -775,9 +763,8 @@ void findCyclicDom(NGHolder &g, vector &cyclic, continue; } - DEBUG_PRINTF("vertex %u is dominated by directly-connected cyclic " - "vertex %u\n", g[v].index, - g[dom].index); + DEBUG_PRINTF("vertex %zu is dominated by directly-connected cyclic " + "vertex %zu\n", g[v].index, g[dom].index); // iff all paths through in-edge e of v involve vertices whose // reachability is a subset of reach(dom), we can delete edge e. @@ -787,8 +774,8 @@ void findCyclicDom(NGHolder &g, vector &cyclic, } if (reversePathReachSubset(e, dom, g)) { - DEBUG_PRINTF("edge (%u, %u) can be removed: leading paths " - "share dom reach\n", + DEBUG_PRINTF("edge (%zu, %zu) can be removed: leading " + "paths share dom reach\n", g[source(e, g)].index, g[target(e, g)].index); dead.insert(e); if (source(e, g) == v) { @@ -814,11 +801,9 @@ void findCyclicPostDom(NGHolder &g, vector &cyclic, // Path out through a post-dominator (e.g. a?.+foobar') NFAVertex postdom = postdominators[v]; - if (postdom && cyclic[g[postdom].index] - && edge(v, postdom, g).second) { - DEBUG_PRINTF("vertex %u is postdominated by directly-connected " - "cyclic vertex %u\n", g[v].index, - g[postdom].index); + if (postdom && cyclic[g[postdom].index] && edge(v, postdom, g).second) { + DEBUG_PRINTF("vertex %zu is postdominated by directly-connected " + "cyclic vertex %zu\n", g[v].index, g[postdom].index); // iff all paths through in-edge e of v involve vertices whose // reachability is a subset of reach(dom), we can delete edge e. 
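/* The try blocks above rely on a BGL idiom worth spelling out: a
 * depth_first_visit whose visitor throws to abandon the whole search, plus a
 * terminator functor that stops expansion at the dominator. A compilable
 * miniature on an invented graph; Mismatch and StopAt stand in for
 * Hyperscan's ReachMismatch and VertexIs:
 */
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/property_map/property_map.hpp>
#include <iostream>
#include <map>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::directedS>;
using Vertex = Graph::vertex_descriptor;

struct Mismatch {};

struct CheckVisitor : boost::default_dfs_visitor {
    Vertex banned;
    explicit CheckVisitor(Vertex b) : banned(b) {}
    void discover_vertex(Vertex v, const Graph &) const {
        if (v == banned) {
            throw Mismatch(); // abandon the entire walk
        }
    }
};

struct StopAt {
    Vertex stop;
    bool operator()(Vertex v, const Graph &) const { return v == stop; }
};

int main() {
    Graph g(4);
    add_edge(0, 1, g);
    add_edge(1, 2, g);
    add_edge(2, 3, g);

    std::map<Vertex, boost::default_color_type> colour;
    try {
        boost::depth_first_visit(g, Vertex(0), CheckVisitor(Vertex(3)),
                                 boost::make_assoc_property_map(colour),
                                 StopAt{Vertex(2)}); // don't expand past 2
        std::cout << "never reached vertex 3\n";
    } catch (const Mismatch &) {
        std::cout << "walk hit the banned vertex\n";
    }
    return 0;
}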
@@ -828,8 +813,8 @@ void findCyclicPostDom(NGHolder &g, vector &cyclic, } if (forwardPathReachSubset(e, postdom, g)) { - DEBUG_PRINTF("edge (%u, %u) can be removed: trailing paths " - "share postdom reach\n", + DEBUG_PRINTF("edge (%zu, %zu) can be removed: trailing " + "paths share postdom reach\n", g[source(e, g)].index, g[target(e, g)].index); if (target(e, g) == v) { cyclic[g[v].index] = false; @@ -844,7 +829,7 @@ void findCyclicPostDom(NGHolder &g, vector &cyclic, bool removeRedundancy(NGHolder &g, som_type som) { DEBUG_PRINTF("rr som = %d\n", (int)som); - g.renumberVertices(); + renumber_vertices(g); // Cheap check: if all the non-special vertices have in-degree one and // out-degree one, there's no redundancy in this here graph and we can diff --git a/src/nfagraph/ng_region.cpp b/src/nfagraph/ng_region.cpp index 124e9fa5..0ecd7bd6 100644 --- a/src/nfagraph/ng_region.cpp +++ b/src/nfagraph/ng_region.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,7 +71,7 @@ using namespace std; namespace ue2 { typedef ue2::unordered_set BackEdgeSet; -typedef boost::filtered_graph> +typedef boost::filtered_graph> AcyclicGraph; namespace { @@ -92,17 +92,17 @@ void checkAndAddExitCandidate(const AcyclicGraph &g, /* find the set of vertices reachable from v which are not in r */ for (auto w : adjacent_vertices_range(v, g)) { - if (!contains(r, w)) { + if (!contains(r, NFAVertex(w))) { if (!open) { - exits->push_back(exit_info(v)); + exits->push_back(exit_info(NFAVertex(v))); open = &exits->back().open; } - open->insert(w); + open->insert(NFAVertex(w)); } } if (open) { - DEBUG_PRINTF("exit %u\n", g[v].index); + DEBUG_PRINTF("exit %zu\n", g[v].index); } } @@ -141,7 +141,7 @@ bool exitValid(UNUSED const AcyclicGraph &g, const vector &exits, return true; } if (exits.size() == 1 && open_jumps.size() == 1) { - DEBUG_PRINTF("oj %u, e %u\n", g[*open_jumps.begin()].index, + DEBUG_PRINTF("oj %zu, e %zu\n", g[*open_jumps.begin()].index, g[exits[0].exit].index); if (*open_jumps.begin() == exits[0].exit) { return true; @@ -190,7 +190,7 @@ void buildInitialCandidate(const AcyclicGraph &g, if (exits->empty()) { DEBUG_PRINTF("odd\n"); candidate->clear(); - DEBUG_PRINTF("adding %u to initial\n", g[*it].index); + DEBUG_PRINTF("adding %zu to initial\n", g[*it].index); candidate->insert(*it); open_jumps->erase(*it); checkAndAddExitCandidate(g, *candidate, *it, exits); @@ -202,7 +202,7 @@ void buildInitialCandidate(const AcyclicGraph &g, candidate->clear(); for (; it != ite; ++it) { - DEBUG_PRINTF("adding %u to initial\n", g[*it].index); + DEBUG_PRINTF("adding %zu to initial\n", g[*it].index); candidate->insert(*it); if (contains(enters, *it)) { break; @@ -231,10 +231,10 @@ void findDagLeaders(const NGHolder &h, const AcyclicGraph &g, vector exits; ue2::unordered_set candidate; ue2::unordered_set open_jumps; - DEBUG_PRINTF("adding %u to current\n", g[*t_it].index); + DEBUG_PRINTF("adding %zu to current\n", g[*t_it].index); assert(t_it != topo.rend()); candidate.insert(*t_it++); - DEBUG_PRINTF("adding %u to current\n", g[*t_it].index); + DEBUG_PRINTF("adding %zu to current\n", g[*t_it].index); assert(t_it != topo.rend()); candidate.insert(*t_it++); findExits(g, candidate, &exits); @@ -257,7 +257,7 @@ void findDagLeaders(const NGHolder &h, const AcyclicGraph &g, &open_jumps); } else { NFAVertex curr = *t_it; - 
DEBUG_PRINTF("adding %u to current\n", g[curr].index); + DEBUG_PRINTF("adding %zu to current\n", g[curr].index); candidate.insert(curr); open_jumps.erase(curr); refineExits(g, candidate, *t_it, &exits); @@ -284,7 +284,7 @@ void mergeUnderBackEdges(const NGHolder &g, const vector &topo, continue; } - DEBUG_PRINTF("merging v = %u(%u), u = %u(%u)\n", g[v].index, rv, + DEBUG_PRINTF("merging v = %zu(%u), u = %zu(%u)\n", g[v].index, rv, g[u].index, ru); assert(rv < ru); @@ -350,8 +350,8 @@ void liftSinks(const AcyclicGraph &acyclic_g, vector &topoOrder) { } if (isLeafNode(v, acyclic_g)) { - DEBUG_PRINTF("sink found %u\n", acyclic_g[v].index); - sinks.insert(v); + DEBUG_PRINTF("sink found %zu\n", acyclic_g[v].index); + sinks.insert(NFAVertex(v)); } } @@ -365,18 +365,18 @@ void liftSinks(const AcyclicGraph &acyclic_g, vector &topoOrder) { DEBUG_PRINTF("look\n"); changed = false; for (auto v : vertices_range(acyclic_g)) { - if (is_special(v, acyclic_g) || contains(sinks, v)) { + if (is_special(v, acyclic_g) || contains(sinks, NFAVertex(v))) { continue; } for (auto w : adjacent_vertices_range(v, acyclic_g)) { - if (!contains(sinks, w)) { + if (!contains(sinks, NFAVertex(w))) { goto next; } } - DEBUG_PRINTF("sink found %u\n", acyclic_g[v].index); - sinks.insert(v); + DEBUG_PRINTF("sink found %zu\n", acyclic_g[v].index); + sinks.insert(NFAVertex(v)); changed = true; next:; } @@ -387,10 +387,10 @@ void liftSinks(const AcyclicGraph &acyclic_g, vector &topoOrder) { continue; } NFAVertex s = *ri; - DEBUG_PRINTF("handling sink %u\n", acyclic_g[s].index); + DEBUG_PRINTF("handling sink %zu\n", acyclic_g[s].index); ue2::unordered_set parents; for (const auto &e : in_edges_range(s, acyclic_g)) { - parents.insert(source(e, acyclic_g)); + parents.insert(NFAVertex(source(e, acyclic_g))); } /* vertex has no children not reachable on a back edge, bubble the @@ -417,10 +417,9 @@ vector buildTopoOrder(const NGHolder &w, vector &colours) { vector topoOrder; - topological_sort( - acyclic_g, back_inserter(topoOrder), - color_map(make_iterator_property_map( - colours.begin(), get(&NFAGraphVertexProps::index, acyclic_g)))); + topological_sort(acyclic_g, back_inserter(topoOrder), + color_map(make_iterator_property_map(colours.begin(), + get(vertex_index, acyclic_g)))); reorderSpecials(w, acyclic_g, topoOrder); @@ -432,7 +431,7 @@ vector buildTopoOrder(const NGHolder &w, DEBUG_PRINTF("TOPO ORDER\n"); for (auto ri = topoOrder.rbegin(); ri != topoOrder.rend(); ++ri) { - DEBUG_PRINTF("[%u]\n", acyclic_g[*ri].index); + DEBUG_PRINTF("[%zu]\n", acyclic_g[*ri].index); } DEBUG_PRINTF("----------\n"); @@ -448,14 +447,14 @@ ue2::unordered_map assignRegions(const NGHolder &g) { // Build an acyclic graph for this NGHolder. BackEdgeSet deadEdges; - depth_first_search( - g.g, visitor(BackEdges(deadEdges)) - .root_vertex(g.start) - .color_map(make_iterator_property_map( - colours.begin(), get(&NFAGraphVertexProps::index, g.g)))); + depth_first_search(g, + visitor(BackEdges(deadEdges)) + .root_vertex(g.start) + .color_map(make_iterator_property_map(colours.begin(), + get(vertex_index, g)))); - AcyclicFilter af(&deadEdges); - AcyclicGraph acyclic_g(g.g, af); + auto af = make_bad_edge_filter(&deadEdges); + AcyclicGraph acyclic_g(g, af); // Build a (reverse) topological ordering. 
vector topoOrder = buildTopoOrder(g, acyclic_g, colours); diff --git a/src/nfagraph/ng_region.h b/src/nfagraph/ng_region.h index 464a6838..a56933dc 100644 --- a/src/nfagraph/ng_region.h +++ b/src/nfagraph/ng_region.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -181,7 +181,7 @@ bool isOptionalRegion(const Graph &g, NFAVertex v, const ue2::unordered_map ®ion_map) { assert(isRegionEntry(g, v, region_map)); - DEBUG_PRINTF("check if r%u is optional (inspecting v%u)\n", + DEBUG_PRINTF("check if r%u is optional (inspecting v%zu)\n", region_map.at(v), g[v].index); // Region zero is never optional. @@ -198,12 +198,12 @@ bool isOptionalRegion(const Graph &g, NFAVertex v, if (inSameRegion(g, v, u, region_map)) { continue; } - DEBUG_PRINTF(" searching from u=%u\n", g[u].index); + DEBUG_PRINTF(" searching from u=%zu\n", g[u].index); assert(inEarlierRegion(g, v, u, region_map)); for (auto w : adjacent_vertices_range(u, g)) { - DEBUG_PRINTF(" searching to w=%u\n", g[w].index); + DEBUG_PRINTF(" searching to w=%zu\n", g[w].index); if (inLaterRegion(g, v, w, region_map)) { return true; } diff --git a/src/nfagraph/ng_region_redundancy.cpp b/src/nfagraph/ng_region_redundancy.cpp index 5cd266dc..264e4312 100644 --- a/src/nfagraph/ng_region_redundancy.cpp +++ b/src/nfagraph/ng_region_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -90,7 +90,7 @@ void processCyclicStateForward(NGHolder &h, NFAVertex cyc, CharReach cr = h[cyc].char_reach; auto reports = h[cyc].reports; - DEBUG_PRINTF("going forward from %u/%u\n", h[cyc].index, + DEBUG_PRINTF("going forward from %zu/%u\n", h[cyc].index, region); map::const_iterator it; @@ -98,7 +98,7 @@ void processCyclicStateForward(NGHolder &h, NFAVertex cyc, NFAVertex v = it->second.entry; const CharReach ®ion_cr = it->second.cr; assert(isRegionEntry(h, v, region_map) && !is_special(v, h)); - DEBUG_PRINTF("checking %u\n", h[v].index); + DEBUG_PRINTF("checking %zu\n", h[v].index); if (!region_cr.isSubsetOf(cr)) { DEBUG_PRINTF("doesn't cover the reach of region %u\n", region); @@ -107,8 +107,8 @@ void processCyclicStateForward(NGHolder &h, NFAVertex cyc, if (isOptionalRegion(h, v, region_map) && !regionHasUnexpectedAccept(h, region, reports, region_map)) { - DEBUG_PRINTF("cyclic state %u leads to optional region leader %u\n", - h[cyc].index, h[v].index); + DEBUG_PRINTF("cyclic state %zu leads to optional region leader" + " %zu\n", h[cyc].index, h[v].index); deadRegions.insert(region); } else if (isSingletonRegion(h, v, region_map)) { /* we can use this region as straw and suck in optional regions on @@ -136,14 +136,14 @@ void processCyclicStateReverse(NGHolder &h, NFAVertex cyc, CharReach cr = h[cyc].char_reach; auto reports = h[cyc].reports; - DEBUG_PRINTF("going back from %u/%u\n", h[cyc].index, region); + DEBUG_PRINTF("going back from %zu/%u\n", h[cyc].index, region); map::const_iterator it; while ((it = info.find(--region)) != info.end()) { NFAVertex v = it->second.entry; const CharReach ®ion_cr = it->second.cr; assert(isRegionEntry(h, v, region_map) && !is_special(v, h)); - DEBUG_PRINTF("checking %u\n", h[v].index); + DEBUG_PRINTF("checking 
%zu\n", h[v].index); if (!region_cr.isSubsetOf(cr)) { DEBUG_PRINTF("doesn't cover the reach of region %u\n", region); @@ -152,7 +152,7 @@ void processCyclicStateReverse(NGHolder &h, NFAVertex cyc, if (isOptionalRegion(h, v, region_map) && !regionHasUnexpectedAccept(h, region, reports, region_map)) { - DEBUG_PRINTF("cyclic state %u trails optional region leader %u\n", + DEBUG_PRINTF("cyclic state %zu trails optional region leader %zu\n", h[cyc].index, h[v].index); deadRegions.insert(region); } else if (isSingletonRegion(h, v, region_map)) { diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index bc7e73d3..a16e2715 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -61,6 +61,8 @@ #include using namespace std; +using boost::depth_first_search; +using boost::depth_first_visit; namespace ue2 { @@ -99,7 +101,7 @@ struct ReachFilter { const Graph *g = nullptr; }; -typedef boost::filtered_graph > RepeatGraph; +typedef boost::filtered_graph> RepeatGraph; struct ReachSubgraph { vector vertices; @@ -126,9 +128,11 @@ void findInitDepths(const NGHolder &g, } } -template static -void buildTopoOrder(const Graph &g, vector &topoOrder) { +vector buildTopoOrder(const RepeatGraph &g) { + /* Note: RepeatGraph is a filtered version of NGHolder and still has + * NFAVertex as its vertex descriptor */ + typedef ue2::unordered_set EdgeSet; EdgeSet deadEdges; @@ -138,13 +142,15 @@ void buildTopoOrder(const Graph &g, vector &topoOrder) { depth_first_search(g, visitor(BackEdges(deadEdges)). color_map(make_assoc_property_map(colours))); - AcyclicFilter af(&deadEdges); - boost::filtered_graph > acyclic_g(g, af); + auto acyclic_g = make_filtered_graph(g, make_bad_edge_filter(&deadEdges)); + vector topoOrder; topological_sort(acyclic_g, back_inserter(topoOrder), color_map(make_assoc_property_map(colours))); reverse(topoOrder.begin(), topoOrder.end()); + + return topoOrder; } static @@ -172,7 +178,7 @@ bool roguePredecessor(const NGHolder &g, NFAVertex v, continue; } if (!contains(pred, u)) { - DEBUG_PRINTF("%u is a rogue pred\n", g[u].index); + DEBUG_PRINTF("%zu is a rogue pred\n", g[u].index); return true; } @@ -198,7 +204,7 @@ bool rogueSuccessor(const NGHolder &g, NFAVertex v, } if (!contains(succ, w)) { - DEBUG_PRINTF("%u is a rogue succ\n", g[w].index); + DEBUG_PRINTF("%zu is a rogue succ\n", g[w].index); return true; } @@ -215,8 +221,8 @@ bool rogueSuccessor(const NGHolder &g, NFAVertex v, static bool hasDifferentTops(const NGHolder &g, const vector &verts) { - bool found = false; - u32 top = 0; + /* TODO: check that we need this now that we allow multiple tops */ + const flat_set *tops = nullptr; for (auto v : verts) { for (const auto &e : in_edges_range(v, g)) { @@ -224,17 +230,12 @@ bool hasDifferentTops(const NGHolder &g, const vector &verts) { if (u != g.start && u != g.startDs) { continue; // Only edges from starts have valid top properties. } - u32 t = g[e].top; - DEBUG_PRINTF("edge (%u,%u) with top %u\n", g[u].index, - g[v].index, t); - assert(t < NFA_MAX_TOP_MASKS); - if (!found) { - found = true; - top = t; - } else { - if (t != top) { - return true; // More than one top. - } + DEBUG_PRINTF("edge (%zu,%zu) with %zu tops\n", g[u].index, + g[v].index, g[e].tops.size()); + if (!tops) { + tops = &g[e].tops; + } else if (g[e].tops != *tops) { + return true; // More than one set of tops. 
} } } @@ -249,14 +250,14 @@ bool vertexIsBad(const NGHolder &g, NFAVertex v, const ue2::unordered_set &pred, const ue2::unordered_set &succ, const flat_set &reports) { - DEBUG_PRINTF("check vertex %u\n", g[v].index); + DEBUG_PRINTF("check vertex %zu\n", g[v].index); // We must drop any vertex that is the target of a back-edge within // our subgraph. The tail set contains all vertices that are after v in a // topo ordering. for (auto u : inv_adjacent_vertices_range(v, g)) { if (contains(tail, u)) { - DEBUG_PRINTF("back-edge (%u,%u) in subgraph found\n", + DEBUG_PRINTF("back-edge (%zu,%zu) in subgraph found\n", g[u].index, g[v].index); return true; } @@ -266,18 +267,18 @@ bool vertexIsBad(const NGHolder &g, NFAVertex v, // edges from *all* the vertices in pred and no other external entries. // Similarly for exits. if (roguePredecessor(g, v, involved, pred)) { - DEBUG_PRINTF("preds for %u not well-formed\n", g[v].index); + DEBUG_PRINTF("preds for %zu not well-formed\n", g[v].index); return true; } if (rogueSuccessor(g, v, involved, succ)) { - DEBUG_PRINTF("succs for %u not well-formed\n", g[v].index); + DEBUG_PRINTF("succs for %zu not well-formed\n", g[v].index); return true; } // All reporting vertices should have the same reports. if (is_match_vertex(v, g) && reports != g[v].reports) { - DEBUG_PRINTF("report mismatch to %u\n", g[v].index); + DEBUG_PRINTF("report mismatch to %zu\n", g[v].index); return true; } @@ -297,8 +298,7 @@ void splitSubgraph(const NGHolder &g, const deque &verts, NFAUndirectedGraph ug; ue2::unordered_map old2new; - ue2::unordered_map newIdx2old; - createUnGraph(verts_g.g, true, true, ug, old2new, newIdx2old); + createUnGraph(verts_g, true, true, ug, old2new); ue2::unordered_map repeatMap; @@ -523,7 +523,7 @@ bool processSubgraph(const NGHolder &g, ReachSubgraph &rsi, if (u == first) { continue; // no self-loops } - DEBUG_PRINTF("pred vertex %u\n", g[u].index); + DEBUG_PRINTF("pred vertex %zu\n", g[u].index); dist[u].insert(0); } @@ -625,7 +625,7 @@ void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v, vector &tugs) { if (allPredsInSubgraph(v, g, involved)) { // We can transform this vertex into a tug trigger in-place. 
- DEBUG_PRINTF("all preds in subgraph, vertex %u becomes tug\n", + DEBUG_PRINTF("all preds in subgraph, vertex %zu becomes tug\n", g[v].index); add_edge(cyclic, v, g); tugs.push_back(v); @@ -637,7 +637,7 @@ void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v, NFAVertex t = clone_vertex(g, v); depths[t] = depths[v]; - DEBUG_PRINTF("there are other paths, cloned tug %u from vertex %u\n", + DEBUG_PRINTF("there are other paths, cloned tug %zu from vertex %zu\n", g[t].index, g[v].index); tugs.push_back(t); @@ -654,7 +654,7 @@ NFAVertex createCyclic(NGHolder &g, ReachSubgraph &rsi) { NFAVertex cyclic = clone_vertex(g, last); add_edge(cyclic, cyclic, g); - DEBUG_PRINTF("created cyclic vertex %u\n", g[cyclic].index); + DEBUG_PRINTF("created cyclic vertex %zu\n", g[cyclic].index); return cyclic; } @@ -665,7 +665,7 @@ NFAVertex createPos(NGHolder &g, ReachSubgraph &rsi) { g[pos].char_reach = g[first].char_reach; - DEBUG_PRINTF("created pos vertex %u\n", g[pos].index); + DEBUG_PRINTF("created pos vertex %zu\n", g[pos].index); return pos; } @@ -711,7 +711,7 @@ void unpeelNearEnd(NGHolder &g, ReachSubgraph &rsi, NFAVertex d = clone_vertex(g, last); depths[d] = depths[last]; - DEBUG_PRINTF("created vertex %u\n", g[d].index); + DEBUG_PRINTF("created vertex %zu\n", g[d].index); for (auto v : *succs) { add_edge(d, v, g); @@ -952,7 +952,7 @@ bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi, zap = it; break; } else { - DEBUG_PRINTF("%u is involved in another repeat\n", g[*it].index); + DEBUG_PRINTF("%zu is involved in another repeat\n", g[*it].index); } } DEBUG_PRINTF("peeling %zu vertices from front\n", @@ -969,7 +969,7 @@ bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi, zap = it.base(); // Note: erases everything after it. break; } else { - DEBUG_PRINTF("%u is involved in another repeat\n", g[*it].index); + DEBUG_PRINTF("%zu is involved in another repeat\n", g[*it].index); } } DEBUG_PRINTF("peeling %zu vertices from back\n", @@ -980,7 +980,7 @@ bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi, // no-no. 
for (auto v : rsi.vertices) { if (contains(created, v)) { - DEBUG_PRINTF("vertex %u is in another repeat\n", g[v].index); + DEBUG_PRINTF("vertex %zu is in another repeat\n", g[v].index); return false; } } @@ -1003,7 +1003,7 @@ void peelStartDotStar(const NGHolder &g, NFAVertex first = rsi.vertices.front(); if (depths.at(first).fromStartDotStar.min == depth(1)) { - DEBUG_PRINTF("peeling start front vertex %u\n", g[first].index); + DEBUG_PRINTF("peeling start front vertex %zu\n", g[first].index); rsi.vertices.erase(rsi.vertices.begin()); reprocessSubgraph(g, grey, rsi); } @@ -1012,8 +1012,8 @@ void peelStartDotStar(const NGHolder &g, static void buildReachSubgraphs(const NGHolder &g, vector &rs, const u32 minNumVertices) { - const ReachFilter fil(&g.g); - const RepeatGraph rg(g.g, fil); + const ReachFilter fil(&g); + const RepeatGraph rg(g, fil); if (!isCompBigEnough(rg, minNumVertices)) { DEBUG_PRINTF("component not big enough, bailing\n"); @@ -1021,19 +1021,17 @@ void buildReachSubgraphs(const NGHolder &g, vector &rs, } NFAUndirectedGraph ug; - ue2::unordered_map old2new; - ue2::unordered_map newIdx2old; - createUnGraph(rg, true, true, ug, old2new, newIdx2old); + unordered_map old2new; + createUnGraph(rg, true, true, ug, old2new); - ue2::unordered_map repeatMap; + unordered_map repeatMap; unsigned int num; num = connected_components(ug, make_assoc_property_map(repeatMap)); DEBUG_PRINTF("found %u connected repeat components\n", num); // Now, we build a set of topo-ordered ReachSubgraphs. - vector topoOrder; - buildTopoOrder(rg, topoOrder); + vector topoOrder = buildTopoOrder(rg); rs.resize(num); @@ -1084,7 +1082,7 @@ bool entered_at_fixed_offset(NFAVertex v, const NGHolder &g, if (is_triggered(g) && !contains(reached_by_fixed_tops, v)) { /* can't do this for infix/suffixes unless we know trigger literals * can only occur at one offset */ - DEBUG_PRINTF("bad top(s) for %u\n", g[v].index); + DEBUG_PRINTF("bad top(s) for %zu\n", g[v].index); return false; } @@ -1104,8 +1102,8 @@ bool entered_at_fixed_offset(NFAVertex v, const NGHolder &g, for (auto u : inv_adjacent_vertices_range(v, g)) { const depth &u_max_depth = depths.at(u).fromStart.max; - DEBUG_PRINTF("pred %u max depth %s from start\n", - g[u].index, u_max_depth.str().c_str()); + DEBUG_PRINTF("pred %zu max depth %s from start\n", g[u].index, + u_max_depth.str().c_str()); if (u_max_depth != first - depth(1)) { return false; } @@ -1123,12 +1121,12 @@ NFAVertex buildTriggerStates(NGHolder &g, const vector &trigger, g[v].char_reach = cr; add_edge(u, v, g); if (u == g.start) { - g[edge(u, v, g).first].top = top; + g[edge(u, v, g)].tops.insert(top); } u = v; } - DEBUG_PRINTF("trigger len=%zu has sink %u\n", trigger.size(), g[u].index); + DEBUG_PRINTF("trigger len=%zu has sink %zu\n", trigger.size(), g[u].index); return u; } @@ -1153,18 +1151,21 @@ void addTriggers(NGHolder &g, continue; } - const auto &top = g[e].top; + const auto &tops = g[e].tops; // The caller may not have given us complete trigger information. If we // don't have any triggers for a particular top, we should just leave // it alone. 
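/* The reworked loop that follows only detaches an edge from start once
 * trigger information is known for *every* top carried on that edge; a
 * single unknown top takes the goto next_edge early-out instead. The
 * all-or-nothing test on its own, sketched with std::all_of over invented
 * types:
 */
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <vector>

using Top = unsigned;
using TriggerMap = std::map<Top, std::vector<int>>; // stand-in payload

bool all_tops_covered(const std::set<Top> &edge_tops,
                      const TriggerMap &triggers) {
    return std::all_of(edge_tops.begin(), edge_tops.end(),
                       [&](Top t) { return triggers.count(t) != 0; });
}

int main() {
    TriggerMap triggers{{0, {}}, {1, {}}};
    assert(all_tops_covered({0, 1}, triggers));
    assert(!all_tops_covered({0, 2}, triggers)); // top 2 has no triggers
    return 0;
}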
- if (!contains(triggers, top)) { - DEBUG_PRINTF("no triggers for top %u\n", top); - continue; - } + for (u32 top : tops) { + if (!contains(triggers, top)) { + DEBUG_PRINTF("no triggers for top %u\n", top); + goto next_edge; + } - starts_by_top[top].push_back(v); + starts_by_top[top].push_back(v); + } dead.push_back(e); + next_edge:; } remove_edges(dead, g); @@ -1255,7 +1256,7 @@ void buildRepeatGraph(NGHolder &rg, if (is_triggered(rg)) { // Add vertices for all our triggers addTriggers(rg, triggers); - rg.renumberVertices(); + renumber_vertices(rg); // We don't know anything about how often this graph is triggered, so we // make the start vertex cyclic for the purposes of this analysis ONLY. @@ -1277,30 +1278,26 @@ void buildInputGraph(NGHolder &lhs, ue2::unordered_map &lhs_map, const NGHolder &g, const NFAVertex first, const map>> &triggers) { - DEBUG_PRINTF("building lhs with first=%u\n", g[first].index); + DEBUG_PRINTF("building lhs with first=%zu\n", g[first].index); cloneHolder(lhs, g, &lhs_map); assert(g.kind == lhs.kind); addTriggers(lhs, triggers); - lhs.renumberVertices(); + renumber_vertices(lhs); // Replace each back-edge (u,v) with an edge (startDs,v), which will // generate entries at at least the rate of the loop created by that // back-edge. set dead; BackEdges > backEdgeVisitor(dead); - depth_first_search( - lhs.g, visitor(backEdgeVisitor) - .root_vertex(lhs.start) - .vertex_index_map(get(&NFAGraphVertexProps::index, lhs.g))); + depth_first_search(lhs, visitor(backEdgeVisitor).root_vertex(lhs.start)); for (const auto &e : dead) { const NFAVertex u = source(e, lhs), v = target(e, lhs); if (u == v) { continue; // Self-loops are OK. } - DEBUG_PRINTF("replacing back-edge (%u,%u) with edge (startDs,%u)\n", - lhs[u].index, lhs[v].index, - lhs[v].index); + DEBUG_PRINTF("replacing back-edge (%zu,%zu) with edge (startDs,%zu)\n", + lhs[u].index, lhs[v].index, lhs[v].index); add_edge_if_not_present(lhs.startDs, v, lhs); remove_edge(e, lhs); @@ -1387,13 +1384,13 @@ bool hasSoleEntry(const NGHolder &g, const ReachSubgraph &rsi, for (const auto &v : rsi.vertices) { assert(!is_special(v, g)); // no specials in repeats assert(contains(rg_map, v)); - DEBUG_PRINTF("rg vertex %u in repeat\n", rg[rg_map.at(v)].index); + DEBUG_PRINTF("rg vertex %zu in repeat\n", rg[rg_map.at(v)].index); region_map.emplace(rg_map.at(v), repeat_region); } for (const auto &v : vertices_range(rg)) { if (!contains(region_map, v)) { - DEBUG_PRINTF("rg vertex %u in lhs (trigger)\n", rg[v].index); + DEBUG_PRINTF("rg vertex %zu in lhs (trigger)\n", rg[v].index); region_map.emplace(v, lhs_region); } } @@ -1435,7 +1432,7 @@ struct StrawWalker { if (next == v) { // Ignore self loop. 
++ai; if (ai == ae) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } next = *ai; } @@ -1450,7 +1447,7 @@ struct StrawWalker { succs.erase(v); for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { next = *ai; - DEBUG_PRINTF("checking %u\n", g[next].index); + DEBUG_PRINTF("checking %zu\n", g[next].index); if (next == v) { continue; } @@ -1471,32 +1468,31 @@ struct StrawWalker { return next; } DEBUG_PRINTF("bailing\n"); - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } return next; } NFAVertex walk(NFAVertex v, vector &straw) const { - DEBUG_PRINTF("walk from %u\n", g[v].index); + DEBUG_PRINTF("walk from %zu\n", g[v].index); ue2::unordered_set visited; straw.clear(); while (!is_special(v, g)) { - DEBUG_PRINTF("checking %u\n", g[v].index); + DEBUG_PRINTF("checking %zu\n", g[v].index); NFAVertex next = step(v); - if (next == NFAGraph::null_vertex()) { + if (next == NGHolder::null_vertex()) { break; } if (!visited.insert(next).second) { - DEBUG_PRINTF("already visited %u, bailing\n", - g[next].index); + DEBUG_PRINTF("already visited %zu, bailing\n", g[next].index); break; /* don't want to get stuck in any complicated loops */ } const CharReach &reach_v = g[v].char_reach; const CharReach &reach_next = g[next].char_reach; if (!reach_v.isSubsetOf(reach_next)) { - DEBUG_PRINTF("%u's reach is not a superset of %u's\n", + DEBUG_PRINTF("%zu's reach is not a superset of %zu's\n", g[next].index, g[v].index); break; } @@ -1504,7 +1500,7 @@ struct StrawWalker { // If this is cyclic with the right reach, we're done. Note that // startDs fulfils this requirement. if (hasSelfLoop(next, g) && !isBoundedRepeatCyclic(next)) { - DEBUG_PRINTF("found cyclic %u\n", g[next].index); + DEBUG_PRINTF("found cyclic %zu\n", g[next].index); return next; } @@ -1513,7 +1509,7 @@ struct StrawWalker { } straw.clear(); - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } private: @@ -1528,8 +1524,8 @@ static NFAVertex walkStrawToCyclicRev(const NGHolder &g, NFAVertex v, const vector &all_repeats, vector &straw) { - typedef boost::reverse_graph RevGraph; - const RevGraph revg(g.g); + typedef boost::reverse_graph RevGraph; + const RevGraph revg(g); auto cyclic = StrawWalker(g, revg, all_repeats).walk(v, straw); reverse(begin(straw), end(straw)); // path comes from cyclic @@ -1540,7 +1536,7 @@ static NFAVertex walkStrawToCyclicFwd(const NGHolder &g, NFAVertex v, const vector &all_repeats, vector &straw) { - return StrawWalker(g, g.g, all_repeats).walk(v, straw); + return StrawWalker(g, g, all_repeats).walk(v, straw); } /** True if entries to this subgraph must pass through a cyclic state with @@ -1556,7 +1552,7 @@ bool hasCyclicSupersetEntryPath(const NGHolder &g, const ReachSubgraph &rsi, // until we encounter our cyclic, all of which must have superset reach. vector straw; return walkStrawToCyclicRev(g, rsi.vertices.front(), all_repeats, straw) != - NFAGraph::null_vertex(); + NGHolder::null_vertex(); } static @@ -1564,7 +1560,7 @@ bool hasCyclicSupersetExitPath(const NGHolder &g, const ReachSubgraph &rsi, const vector &all_repeats) { vector straw; return walkStrawToCyclicFwd(g, rsi.vertices.back(), all_repeats, straw) != - NFAGraph::null_vertex(); + NGHolder::null_vertex(); } static @@ -1847,7 +1843,7 @@ void buildFeeder(NGHolder &g, const BoundedRepeatData &rd, add_edge(u, feeder, g); } - DEBUG_PRINTF("added feeder %u\n", g[feeder].index); + DEBUG_PRINTF("added feeder %zu\n", g[feeder].index); } else { // No neg trigger means feeder is empty, and unnecessary. 
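/* StrawWalker above is instantiated over both the holder and a
 * boost::reverse_graph view of it (the template arguments were lost in the
 * hunk text above), so one walk routine can follow a straw either forwards
 * or backwards. The adaptor in isolation, on a stock BGL graph:
 */
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/reverse_graph.hpp>
#include <boost/range/iterator_range.hpp>
#include <iostream>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::bidirectionalS>;

int main() {
    Graph g(2);
    add_edge(0, 1, g);

    boost::reverse_graph<Graph, const Graph &> rg(g);
    // In the reversed view, vertex 1 has an out-edge back to vertex 0.
    for (auto e : boost::make_iterator_range(out_edges(1, rg))) {
        std::cout << source(e, rg) << " -> " << target(e, rg) << "\n";
    }
    return 0;
}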
assert(g[rd.pos_trigger].char_reach.all()); @@ -1895,13 +1891,13 @@ bool improveLeadingRepeat(NGHolder &g, BoundedRepeatData &rd, // This transformation is only safe if the straw path from startDs that // we've discovered can *only* lead to this repeat, since we're going to // remove the self-loop on startDs. - if (hasGreaterOutDegree(2, g.startDs, g)) { + if (proper_out_degree(g.startDs, g) > 1) { DEBUG_PRINTF("startDs has other successors\n"); return false; } for (const auto &v : straw) { if (proper_out_degree(v, g) != 1) { - DEBUG_PRINTF("branch between startDs and repeat, from vertex %u\n", + DEBUG_PRINTF("branch between startDs and repeat, from vertex %zu\n", g[v].index); return false; } @@ -2071,8 +2067,8 @@ public: const depth &our_depth_in) : top_depths(top_depths_in), our_depth(our_depth_in) {} - void discover_vertex(NFAVertex v, UNUSED const NFAGraph &g) { - DEBUG_PRINTF("discovered %u (depth %s)\n", g[v].index, + void discover_vertex(NFAVertex v, UNUSED const NGHolder &g) { + DEBUG_PRINTF("discovered %zu (depth %s)\n", g[v].index, our_depth.str().c_str()); auto it = top_depths.find(v); @@ -2105,28 +2101,39 @@ void populateFixedTopInfo(const map &fixed_depth_tops, if (v == g.startDs) { continue; } - u32 top = g[e].top; + depth td = depth::infinity(); - if (contains(fixed_depth_tops, top)) { - td = fixed_depth_tops.at(top); + for (u32 top : g[e].tops) { + if (!contains(fixed_depth_tops, top)) { + td = depth::infinity(); + break; + } + depth td_t = fixed_depth_tops.at(top); + if (td == td_t) { + continue; + } else if (td == depth::infinity()) { + td = td_t; + } else { + td = depth::infinity(); + break; + } } - DEBUG_PRINTF("scanning from %u top=%u depth=%s\n", - g[v].index, top, td.str().c_str()); + DEBUG_PRINTF("scanning from %zu depth=%s\n", g[v].index, + td.str().c_str()); /* for each vertex reachable from v update its map to reflect that it is * reachable from a top of depth td. */ - depth_first_visit( - g.g, v, pfti_visitor(top_depths, td), - make_iterator_property_map(colours.begin(), - get(&NFAGraphVertexProps::index, g.g))); + depth_first_visit(g, v, pfti_visitor(top_depths, td), + make_iterator_property_map(colours.begin(), + get(vertex_index, g))); } for (const auto &v_depth : top_depths) { const NFAVertex v = v_depth.first; const depth &d = v_depth.second; if (d.is_finite()) { - DEBUG_PRINTF("%u reached by fixed tops at depth %s\n", + DEBUG_PRINTF("%zu reached by fixed tops at depth %s\n", g[v].index, d.str().c_str()); reached_by_fixed_tops->insert(v); } @@ -2143,19 +2150,16 @@ bool hasOverlappingRepeats(UNUSED const NGHolder &g, for (const auto &br : repeats) { if (contains(involved, br.cyclic)) { - DEBUG_PRINTF("already seen cyclic %u\n", - g[br.cyclic].index); + DEBUG_PRINTF("already seen cyclic %zu\n", g[br.cyclic].index); return true; } if (contains(involved, br.pos_trigger)) { - DEBUG_PRINTF("already seen pos %u\n", - g[br.pos_trigger].index); + DEBUG_PRINTF("already seen pos %zu\n", g[br.pos_trigger].index); return true; } for (auto v : br.tug_triggers) { if (contains(involved, v)) { - DEBUG_PRINTF("already seen tug %u\n", - g[v].index); + DEBUG_PRINTF("already seen tug %zu\n", g[v].index); return true; } } @@ -2301,7 +2305,7 @@ void analyseRepeats(NGHolder &g, const ReportManager *rm, // Go to town on the remaining acceptable subgraphs. 
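/* populateFixedTopInfo() above folds the fixed depths of all tops on an edge
 * into a single value: any top without a known fixed depth, or two tops that
 * disagree, collapse the result to infinity. The same fold with
 * std::optional standing in for ue2's depth type (nullopt plays
 * depth::infinity()):
 */
#include <cassert>
#include <map>
#include <optional>
#include <set>

using Depth = std::optional<unsigned>;

Depth fold_top_depths(const std::set<unsigned> &tops,
                      const std::map<unsigned, unsigned> &fixed_depth_tops) {
    Depth td;
    bool first = true;
    for (unsigned top : tops) {
        auto it = fixed_depth_tops.find(top);
        if (it == fixed_depth_tops.end()) {
            return std::nullopt;         // some top has no fixed depth
        }
        if (first) {
            td = it->second;
            first = false;
        } else if (td != Depth(it->second)) {
            return std::nullopt;         // tops disagree on depth
        }
    }
    return td;
}

int main() {
    std::map<unsigned, unsigned> fixed{{0, 4}, {1, 4}, {2, 9}};
    assert(fold_top_depths({0, 1}, fixed) == Depth(4));
    assert(!fold_top_depths({0, 2}, fixed).has_value());
    assert(!fold_top_depths({0, 3}, fixed).has_value());
    return 0;
}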
ue2::unordered_set created; for (auto &rsi : rs) { - DEBUG_PRINTF("subgraph (beginning vertex %u) is a {%s,%s} repeat\n", + DEBUG_PRINTF("subgraph (beginning vertex %zu) is a {%s,%s} repeat\n", g[rsi.vertices.front()].index, rsi.repeatMin.str().c_str(), rsi.repeatMax.str().c_str()); @@ -2334,7 +2338,7 @@ void analyseRepeats(NGHolder &g, const ReportManager *rm, // Some of our analyses require correctly numbered vertices, so we // renumber after changes. - g.renumberVertices(); + renumber_vertices(g); } bool modified_start_ds = false; @@ -2375,8 +2379,8 @@ void analyseRepeats(NGHolder &g, const ReportManager *rm, // We have modified the graph, so we need to ensure that our edges // and vertices are correctly numbered. - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); // Remove stray report IDs. clearReports(g); } @@ -2415,20 +2419,20 @@ bool isPureRepeat(const NGHolder &g, PureRepeat &repeat) { // Must be start anchored. assert(edge(g.startDs, g.startDs, g).second); - if (hasGreaterOutDegree(1, g.startDs, g)) { + if (out_degree(g.startDs, g) > 1) { DEBUG_PRINTF("Unanchored\n"); return false; } // Must not be EOD-anchored. assert(edge(g.accept, g.acceptEod, g).second); - if (hasGreaterInDegree(1, g.acceptEod, g)) { + if (in_degree(g.acceptEod, g) > 1) { DEBUG_PRINTF("EOD anchored\n"); return false; } // Must have precisely one top. - if (!onlyOneTop(g)) { + if (is_triggered(g) && !onlyOneTop(g)) { DEBUG_PRINTF("Too many tops\n"); return false; } diff --git a/src/nfagraph/ng_restructuring.cpp b/src/nfagraph/ng_restructuring.cpp index 09abf775..32cdac23 100644 --- a/src/nfagraph/ng_restructuring.cpp +++ b/src/nfagraph/ng_restructuring.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,37 +49,71 @@ namespace ue2 { /** Connect the start vertex to each of the vertices in \p tops. This is useful * temporarily for when we need to run a graph algorithm that expects a single * source vertex. */ -void wireStartToTops(NGHolder &g, const map &tops, - vector &topEdges) { - for (const auto &top : tops) { - NFAVertex v = top.second; +static +void wireStartToTops(NGHolder &g, const flat_set &tops, + vector &tempEdges) { + for (NFAVertex v : tops) { assert(!isLeafNode(v, g)); - const NFAEdge &e = add_edge(g.start, v, g).first; - topEdges.push_back(e); + const NFAEdge &e = add_edge(g.start, v, g); + tempEdges.push_back(e); } } +/** + * Returns true if start's successors (aside from startDs) are subset of + * startDs's proper successors or if start has no successors other than startDs. + */ static -void getStateOrdering(NGHolder &g, const map &tops, +bool startIsRedundant(const NGHolder &g) { + /* We ignore startDs as the self-loop may have been stripped as an + * optimisation for repeats (improveLeadingRepeats()). */ + set start; + insert(&start, adjacent_vertices_range(g.start, g)); + start.erase(g.startDs); + + // Trivial case: start has no successors other than startDs. 
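/* The new startIsRedundant() here relaxes the old equality requirement: it
 * asks whether start's successors (minus startDs, gathered just above) form
 * a subset of startDs's proper successors, starting with the trivial empty
 * case below. On ordered containers the subset test is std::includes:
 */
#include <algorithm>
#include <cassert>
#include <set>

int main() {
    std::set<int> start_succs{2, 3};      // succ(start) minus startDs
    std::set<int> startds_succs{2, 3, 4}; // proper succ(startDs)

    bool redundant = start_succs.empty() ||
                     std::includes(startds_succs.begin(), startds_succs.end(),
                                   start_succs.begin(), start_succs.end());
    assert(redundant);
    return 0;
}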
+ if (start.empty()) { + DEBUG_PRINTF("start has no out-edges other than to startDs\n"); + return true; + } + + set startDs; + insert(&startDs, adjacent_vertices_range(g.startDs, g)); + startDs.erase(g.startDs); + + if (!is_subset_of(start, startDs)) { + DEBUG_PRINTF("out-edges of start and startDs aren't equivalent\n"); + return false; + } + + return true; +} + +static +void getStateOrdering(NGHolder &g, const flat_set &tops, vector &ordering) { // First, wire up our "tops" to start so that we have a single source, // which will give a nicer topo order. - vector topEdges; - wireStartToTops(g, tops, topEdges); + vector tempEdges; + wireStartToTops(g, tops, tempEdges); - renumberGraphVertices(g); + renumber_vertices(g); vector temp = getTopoOrdering(g); - remove_edges(topEdges, g); + remove_edges(tempEdges, g); // Move {start, startDs} to the end, so they'll be first when we reverse - // the ordering. + // the ordering (if they are required). temp.erase(remove(temp.begin(), temp.end(), g.startDs)); temp.erase(remove(temp.begin(), temp.end(), g.start)); - temp.push_back(g.startDs); - temp.push_back(g.start); + if (proper_out_degree(g.startDs, g)) { + temp.push_back(g.startDs); + } + if (!startIsRedundant(g)) { + temp.push_back(g.start); + } // Walk ordering, remove vertices that shouldn't be participating in state // numbering, such as accepts. @@ -106,7 +140,7 @@ getStateIndices(const NGHolder &h, const vector &ordering) { u32 stateNum = 0; for (auto v : ordering) { - DEBUG_PRINTF("assigning state num %u to vertex %u\n", stateNum, + DEBUG_PRINTF("assigning state num %u to vertex %zu\n", stateNum, h[v].index); states[v] = stateNum++; } @@ -149,16 +183,15 @@ void optimiseTightLoops(const NGHolder &g, vector &ordering) { continue; } - DEBUG_PRINTF("moving vertex %u next to %u\n", - g[v].index, g[u].index); + DEBUG_PRINTF("moving vertex %zu next to %zu\n", g[v].index, g[u].index); ordering.erase(v_it); ordering.insert(++u_it, v); } } -ue2::unordered_map -numberStates(NGHolder &h, const map &tops) { +unordered_map +numberStates(NGHolder &h, const flat_set &tops) { DEBUG_PRINTF("numbering states for holder %p\n", &h); vector ordering; @@ -166,14 +199,10 @@ numberStates(NGHolder &h, const map &tops) { optimiseTightLoops(h, ordering); - ue2::unordered_map states = getStateIndices(h, ordering); - - return states; + return getStateIndices(h, ordering); } -u32 countStates(const NGHolder &g, - const ue2::unordered_map &state_ids, - bool addTops) { +u32 countStates(const unordered_map &state_ids) { if (state_ids.empty()) { return 0; } @@ -184,168 +213,9 @@ u32 countStates(const NGHolder &g, max_state = max(m.second, max_state); } } - u32 num_states = max_state + 1; - assert(contains(state_ids, g.start)); - if (addTops && state_ids.at(g.start) != NO_STATE) { - num_states--; - set tops; - for (auto e : out_edges_range(g.start, g)) { - tops.insert(g[e].top); - } - num_states += tops.size(); - } - return num_states; } -/** - * Returns true if start leads to all of startDs's proper successors or if - * start has no successors other than startDs. - */ -static -bool startIsRedundant(const NGHolder &g) { - set start, startDs; - - for (const auto &e : out_edges_range(g.start, g)) { - NFAVertex v = target(e, g); - if (v == g.startDs) { - continue; - } - start.insert(v); - } - - for (const auto &e : out_edges_range(g.startDs, g)) { - NFAVertex v = target(e, g); - if (v == g.startDs) { - continue; - } - startDs.insert(v); - } - - // Trivial case: start has no successors other than startDs. 
- if (start.empty()) { - DEBUG_PRINTF("start has no out-edges other than to startDs\n"); - return true; - } - - if (start != startDs) { - DEBUG_PRINTF("out-edges of start and startDs aren't equivalent\n"); - return false; - } - - return true; -} - -/** One final, FINAL optimisation. Drop either start or startDs if it's unused - * in this graph. We leave this until this late because having both vertices in - * the graph, with fixed state indices, is useful for merging and other - * analyses. */ -void dropUnusedStarts(NGHolder &g, ue2::unordered_map &states) { - u32 adj = 0; - - if (startIsRedundant(g)) { - DEBUG_PRINTF("dropping unused start\n"); - states[g.start] = NO_STATE; - adj++; - } - - if (proper_out_degree(g.startDs, g) == 0) { - DEBUG_PRINTF("dropping unused startDs\n"); - states[g.startDs] = NO_STATE; - adj++; - } - - if (!adj) { - DEBUG_PRINTF("both start and startDs must remain\n"); - return; - } - - // We have removed one or both of the starts. Walk the non-special vertices - // in the graph with state indices assigned to them and subtract - // adj from all of them. - for (auto v : vertices_range(g)) { - u32 &state = states[v]; // note ref - if (state == NO_STATE) { - continue; - } - if (is_any_start(v, g)) { - assert(state <= 1); - state = 0; // one start remains - } else { - assert(!is_special(v, g)); - assert(state >= adj); - state -= adj; - } - } -} - -flat_set findUnusedStates(const NGHolder &g) { - flat_set dead; - if (startIsRedundant(g)) { - dead.insert(g.start); - } - if (proper_out_degree(g.startDs, g) == 0) { - dead.insert(g.startDs); - } - return dead; -} - -/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to - * accepts. */ -void reverseHolder(const NGHolder &g_in, NGHolder &g) { - // Make the BGL do the grunt work. - ue2::unordered_map vertexMap; - boost::transpose_graph(g_in.g, g.g, - orig_to_copy(boost::make_assoc_property_map(vertexMap)). - vertex_index_map(get(&NFAGraphVertexProps::index, g_in.g))); - - // The transpose_graph operation will have created extra copies of our - // specials. We have to rewire their neighbours to the 'real' specials and - // delete them. - NFAVertex start = vertexMap[g_in.acceptEod]; - NFAVertex startDs = vertexMap[g_in.accept]; - NFAVertex accept = vertexMap[g_in.startDs]; - NFAVertex acceptEod = vertexMap[g_in.start]; - - // Successors of starts. - for (const auto &e : out_edges_range(start, g)) { - NFAVertex v = target(e, g); - add_edge(g.start, v, g[e], g); - } - for (const auto &e : out_edges_range(startDs, g)) { - NFAVertex v = target(e, g); - add_edge(g.startDs, v, g[e], g); - } - - // Predecessors of accepts. - for (const auto &e : in_edges_range(accept, g)) { - NFAVertex u = source(e, g); - add_edge(u, g.accept, g[e], g); - } - for (const auto &e : in_edges_range(acceptEod, g)) { - NFAVertex u = source(e, g); - add_edge(u, g.acceptEod, g[e], g); - } - - // Remove our impostors. - clear_vertex(start, g); - remove_vertex(start, g); - clear_vertex(startDs, g); - remove_vertex(startDs, g); - clear_vertex(accept, g); - remove_vertex(accept, g); - clear_vertex(acceptEod, g); - remove_vertex(acceptEod, g); - - // Renumber so that g's properties (number of vertices, edges) are - // accurate. 
- g.renumberVertices(); - g.renumberEdges(); - - assert(num_vertices(g) == num_vertices(g_in)); - assert(num_edges(g) == num_edges(g_in)); -} - } // namespace ue2 diff --git a/src/nfagraph/ng_restructuring.h b/src/nfagraph/ng_restructuring.h index 5e244bf6..bbd478d5 100644 --- a/src/nfagraph/ng_restructuring.h +++ b/src/nfagraph/ng_restructuring.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,23 +37,8 @@ #include "ue2common.h" #include "util/ue2_containers.h" -#include -#include - namespace ue2 { -class NGHolder; - -/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to - * accepts. */ -void reverseHolder(const NGHolder &g, NGHolder &out); - -/** Connect the start vertex to each of the vertices in \p tops. This is useful - * temporarily for when we need to run a graph algorithm that expects a single - * source vertex. */ -void wireStartToTops(NGHolder &g, const std::map &tops, - std::vector &topEdges); - /** * \brief Special state index value meaning that the vertex will not * participate in an (NFA/DFA/etc) implementation. @@ -63,30 +48,14 @@ static constexpr u32 NO_STATE = ~0; /** * \brief Gives each participating vertex in the graph a unique state index. */ -ue2::unordered_map -numberStates(NGHolder &h, - const std::map &tops = std::map{}); +unordered_map +numberStates(NGHolder &h, const flat_set &tops); /** * \brief Counts the number of states (vertices with state indices) in the * graph. - * - * If addTops is true, also accounts for states that will be constructed for - * each unique top. */ -u32 countStates(const NGHolder &g, - const ue2::unordered_map &state_ids, - bool addTops = true); - -/** Optimisation: drop unnecessary start states. */ -void dropUnusedStarts(NGHolder &g, ue2::unordered_map &states); - -/** - * \brief Returns a set of vertices that will not participate in an - * implementation (NFA, DFA etc) of this graph. For example, starts with no - * successors. - */ -flat_set findUnusedStates(const NGHolder &g); +u32 countStates(const unordered_map &state_ids); } // namespace ue2 diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index 137ac5cc..7066ab27 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -68,8 +68,6 @@ #include #include #include -#include -#include #define NDEBUG_PRINTF(x, ...) 
\ do { if (0) { DEBUG_PRINTF(x, ## __VA_ARGS__); } } while (0) @@ -540,7 +538,7 @@ void getRegionRoseLiterals(const NGHolder &g, DEBUG_PRINTF("inspecting region %u\n", region); set s; for (auto v : vv) { - DEBUG_PRINTF(" exit vertex: %u\n", g[v].index); + DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index); /* Note: RHS can not be depended on to take all subsequent revisits * to this vertex */ set ss = getLiteralSet(g, v, false); @@ -575,8 +573,7 @@ void gatherBackEdges(const NGHolder &g, ue2::unordered_map> *out) { set backEdges; BackEdges> be(backEdges); - depth_first_search(g.g, visitor(be).root_vertex(g.start).vertex_index_map( - get(&NFAGraphVertexProps::index, g.g))); + depth_first_search(g, visitor(be).root_vertex(g.start)); for (const auto &e : backEdges) { (*out)[source(e, g)].push_back(target(e, g)); @@ -759,7 +756,7 @@ unique_ptr LitCollection::pickNext() { unique_ptr rv = move(lits.back()); lits.pop_back(); poisonCandidates(*rv); - DEBUG_PRINTF("best is '%s' %u a%d t%d\n", + DEBUG_PRINTF("best is '%s' %zu a%d t%d\n", dumpString(*(rv->lit.begin())).c_str(), g[rv->vv.front()].index, (int)createsAnchoredLHS(g, rv->vv, depths, grey), @@ -811,6 +808,7 @@ bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 max_delay, bool overhang_ok) { + assert(isCorrectlyTopped(g)); if (max_delay == MO_INVALID_IDX) { max_delay--; } @@ -864,8 +862,6 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, assert(delay <= lit.length()); DEBUG_PRINTF("managed delay %u (of max %u)\n", delay, max_delay); - // For determinism, we make sure that we create these edges from vertices - // in index-sorted order. set pred; for (auto v : curr) { insert(&pred, inv_adjacent_vertices_range(v, g)); @@ -874,16 +870,17 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, clear_in_edges(g.accept, g); clearReports(g); - vector verts(pred.begin(), pred.end()); - sort(verts.begin(), verts.end(), VertexIndexOrdering(g)); - - for (auto v : verts) { - add_edge(v, g.accept, g); + for (auto v : pred) { + NFAEdge e = add_edge(v, g.accept, g); g[v].reports.insert(0); + if (is_triggered(g) && v == g.start) { + g[e].tops.insert(DEFAULT_TOP); + } } pruneUseless(g); assert(allMatchStatesHaveReports(g)); + assert(isCorrectlyTopped(g)); DEBUG_PRINTF("graph has %zu vertices left\n", num_vertices(g)); return delay; @@ -892,6 +889,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 delay, const vector &preds) { assert(delay <= lit.length()); + assert(isCorrectlyTopped(g)); DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); NFAVertex prev = g.accept; @@ -906,7 +904,10 @@ void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, } for (auto v : preds) { - add_edge(v, prev, g); + NFAEdge e = add_edge(v, prev, g); + if (v == g.start && is_triggered(g)) { + g[e].tops.insert(DEFAULT_TOP); + } } // Every predecessor of accept must have a report. 
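/* removeTrailingLiteralStates()/restoreTrailingLiteralStates() above now tag
 * any fresh edge out of start with DEFAULT_TOP when the graph is triggered,
 * which is what the new isCorrectlyTopped() assertions demand. The pattern
 * reduced to a stock BGL graph; EdgeProps and kDefaultTop are invented, and
 * ue2's actual DEFAULT_TOP value is assumed rather than quoted:
 */
#include <boost/graph/adjacency_list.hpp>
#include <cassert>
#include <set>

struct EdgeProps {
    std::set<unsigned> tops;
};
using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::directedS, boost::no_property,
                                    EdgeProps>;
using Vertex = Graph::vertex_descriptor;

constexpr unsigned kDefaultTop = 0; // stand-in for ue2's DEFAULT_TOP

int main() {
    Graph g(3);
    Vertex start = 0, v = 1;
    bool triggered = true; // is_triggered(g) in the real code

    auto e = add_edge(start, v, g).first;
    if (triggered) {
        g[e].tops.insert(kDefaultTop); // keep the graph correctly topped
    }
    assert(!g[e].tops.empty());
    return 0;
}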
@@ -914,9 +915,10 @@ void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, g[u].reports.insert(0); } - g.renumberVertices(); - g.renumberEdges(); + renumber_vertices(g); + renumber_edges(g); assert(allMatchStatesHaveReports(g)); + assert(isCorrectlyTopped(g)); } void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, @@ -1144,7 +1146,7 @@ void deanchorIfNeeded(NGHolder &g, bool *orig_anch) { succ_g.erase(g.startDs); for (auto v : adjacent_vertices_range(g.start, g)) { - DEBUG_PRINTF("inspecting cand %u || =%zu\n", g[v].index, + DEBUG_PRINTF("inspecting cand %zu || =%zu\n", g[v].index, g[v].char_reach.size()); if (v == g.startDs || !g[v].char_reach.all()) { @@ -1162,7 +1164,7 @@ void deanchorIfNeeded(NGHolder &g, bool *orig_anch) { } clear_vertex(v, g); remove_vertex(v, g); - g.renumberVertices(); + renumber_vertices(g); return; } @@ -1693,7 +1695,7 @@ void splitEdgesByCut(RoseInGraph &ig, const vector &to_cut, /* TODO need to update v_mapping (if we were doing more cuts) */ } - DEBUG_PRINTF("splitting on pivot %u\n", h[pivot].index); + DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index); ue2::unordered_map temp_map; shared_ptr new_lhs = make_shared(); splitLHS(h, pivot, new_lhs.get(), &temp_map); @@ -1766,8 +1768,8 @@ bool doNetflowCut(RoseInGraph &ig, const vector &to_cut, return false; } - h.renumberVertices(); - h.renumberEdges(); + renumber_vertices(h); + renumber_edges(h); /* Step 1: Get scores for all edges */ vector scores = scoreEdges(h); /* scores by edge_index */ /* Step 2: poison scores for edges covered by successor literal */ @@ -2366,12 +2368,17 @@ void makeNocaseWithPrefixMask(RoseInGraph &g, RoseInVertex v) { h[ds].char_reach = CharReach::dot(); - add_edge(h.start, ds, h); + NFAEdge e_start_to_ds = add_edge(h.start, ds, h); add_edge(ds, ds, h); add_edge(ds, h.accept, h); h[h.start].reports.insert(0); h[ds].reports.insert(0); + + if (g[u].type == RIV_LITERAL) { + h[e_start_to_ds].tops.insert(DEFAULT_TOP); + } } else { + assert(g[u].type == RIV_ANCHORED_START); add_edge(h.start, h.accept, h); h[h.start].reports.insert(0); } @@ -2406,14 +2413,14 @@ void explodeLiteral(RoseInGraph &g, RoseInVertex v, g[v_new].s = lit; for (const auto &e : in_edges_range(v, g)) { - RoseInEdge e2 = add_edge(source(e, g), v_new, g[e], g).first; + RoseInEdge e2 = add_edge(source(e, g), v_new, g[e], g); // FIXME: are we safe to share graphs here? For now, make our very // own copy. g[e2].graph = makeGraphCopy(g[e].graph.get()); } for (const auto &e : out_edges_range(v, g)) { - RoseInEdge e2 = add_edge(v_new, target(e, g), g[e], g).first; + RoseInEdge e2 = add_edge(v_new, target(e, g), g[e], g); // FIXME: are we safe to share graphs here? For now, make our very // own copy. g[e2].graph = makeGraphCopy(g[e].graph.get()); @@ -2565,7 +2572,7 @@ bool followedByStar(const vector &vv, const NGHolder &g) { static bool isEodPrefixCandidate(const NGHolder &g) { - if (hasGreaterInDegree(0, g.accept, g)) { + if (in_degree(g.accept, g)) { DEBUG_PRINTF("graph isn't eod anchored\n"); return false; } @@ -2636,7 +2643,7 @@ void processEodPrefixes(RoseInGraph &g) { } // TODO: handle cases with multiple out-edges. 
- if (hasGreaterOutDegree(1, source(e, g), g)) { + if (out_degree(source(e, g), g) > 1) { continue; } @@ -2663,7 +2670,7 @@ void processEodPrefixes(RoseInGraph &g) { } for (auto v : accepts) { - if (!hasGreaterInDegree(0, v, g)) { + if (!in_degree(v, g)) { remove_vertex(v, g); } } @@ -2805,6 +2812,7 @@ unique_ptr buildRose(const NGHolder &h, bool desperation, dumpPreRoseGraph(ig, cc.grey); + renumber_vertices(ig); calcVertexOffsets(ig); return igp; } @@ -2821,6 +2829,7 @@ void desperationImprove(RoseInGraph &ig, const CompileContext &cc) { handleLongMixedSensitivityLiterals(ig); dedupe(ig); pruneUseless(ig); + renumber_vertices(ig); calcVertexOffsets(ig); } @@ -2831,8 +2840,7 @@ bool splitOffRose(RoseBuild &rose, const NGHolder &h, bool prefilter, } // We should have at least one edge into accept or acceptEod! - assert(hasGreaterInDegree(0, h.accept, h) || - hasGreaterInDegree(1, h.acceptEod, h)); + assert(in_degree(h.accept, h) || in_degree(h.acceptEod, h) > 1); unique_ptr igp = buildRose(h, false, cc); if (igp && rose.addRose(*igp, prefilter)) { @@ -2924,6 +2932,7 @@ bool finalChanceRose(RoseBuild &rose, const NGHolder &h, bool prefilter, add_edge(v, a, RoseInEdgeProps(rhs, 0U), ig); } + renumber_vertices(ig); calcVertexOffsets(ig); return rose.addRose(ig, prefilter, true /* final chance */); @@ -2936,8 +2945,7 @@ bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter, } // We should have at least one edge into accept or acceptEod! - assert(hasGreaterInDegree(0, h.accept, h) || - hasGreaterInDegree(1, h.acceptEod, h)); + assert(in_degree(h.accept, h) || in_degree(h.acceptEod, h) > 1); unique_ptr igp; diff --git a/src/nfagraph/ng_small_literal_set.cpp b/src/nfagraph/ng_small_literal_set.cpp index b5867bb9..1d7be65b 100644 --- a/src/nfagraph/ng_small_literal_set.cpp +++ b/src/nfagraph/ng_small_literal_set.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -125,7 +125,7 @@ bool findLiterals(const NGHolder &g, set &out = built[g[v].index]; read_count[g[v].index] = out_degree(v, g); - DEBUG_PRINTF("setting read_count to %zu for %u\n", + DEBUG_PRINTF("setting read_count to %zu for %zu\n", read_count[g[v].index], g[v].index); assert(out.empty()); @@ -154,7 +154,7 @@ bool findLiterals(const NGHolder &g, } set &in = built[g[u].index]; - DEBUG_PRINTF("getting from %u (%zu reads to go)\n", + DEBUG_PRINTF("getting from %zu (%zu reads to go)\n", g[u].index, read_count[g[u].index]); assert(!in.empty()); assert(read_count[g[u].index]); @@ -188,7 +188,7 @@ bool findLiterals(const NGHolder &g, read_count[g[u].index]--; if (!read_count[g[u].index]) { - DEBUG_PRINTF("clearing %u as finished reading\n", g[u].index); + DEBUG_PRINTF("clearing %zu as finished reading\n", g[u].index); in.clear(); } } diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index ed2942bb..f6ba0fa7 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -40,7 +40,6 @@ #include "ng_redundancy.h" #include "ng_region.h" #include "ng_reports.h" -#include "ng_restructuring.h" #include "ng_rose.h" #include "ng_som.h" #include "ng_som_add_redundancy.h" @@ -111,7 +110,7 @@ bool regionCanEstablishSom(const NGHolder &g, DEBUG_PRINTF("region %u\n", region); for (UNUSED auto v : r_exits) { - DEBUG_PRINTF(" exit %u\n", g[v].index); + DEBUG_PRINTF(" exit %zu\n", g[v].index); } /* simple if each 
region exit is at fixed distance from SOM. Note SOM does @@ -120,12 +119,12 @@ bool regionCanEstablishSom(const NGHolder &g, assert(regions.at(v) == region); const DepthMinMax &d = depths.at(g[v].index); if (d.min != d.max) { - DEBUG_PRINTF("failing %u as %s != %s\n", g[v].index, + DEBUG_PRINTF("failing %zu as %s != %s\n", g[v].index, d.min.str().c_str(), d.max.str().c_str()); return false; } } - DEBUG_PRINTF("region %u/%u is good\n", regions.at(r_exits[0]), + DEBUG_PRINTF("region %u/%zu is good\n", regions.at(r_exits[0]), g[r_exits[0]].index); return true; @@ -179,10 +178,7 @@ void buildRegionMapping(const NGHolder &g, set be; BackEdges > backEdgeVisitor(be); - depth_first_search( - g.g, visitor(backEdgeVisitor) - .root_vertex(g.start) - .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); + boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start)); for (const auto &e : be) { NFAVertex u = source(e, g); @@ -209,17 +205,17 @@ void buildRegionMapping(const NGHolder &g, r_i.optional ? " (optional)" : ""); DEBUG_PRINTF(" enters:"); for (u32 i = 0; i < r_i.enters.size(); i++) { - printf(" %u", g[r_i.enters[i]].index); + printf(" %zu", g[r_i.enters[i]].index); } printf("\n"); DEBUG_PRINTF(" exits:"); for (u32 i = 0; i < r_i.exits.size(); i++) { - printf(" %u", g[r_i.exits[i]].index); + printf(" %zu", g[r_i.exits[i]].index); } printf("\n"); DEBUG_PRINTF(" all:"); for (u32 i = 0; i < r_i.full.size(); i++) { - printf(" %u", g[r_i.full[i]].index); + printf(" %zu", g[r_i.full[i]].index); } printf("\n"); } @@ -236,8 +232,7 @@ bool validateXSL(const NGHolder &g, u32 v_region = regions.at(v); if (!is_special(v, g) && v_region > region && (escapes & g[v].char_reach).any()) { - DEBUG_PRINTF("problem with escapes for %u\n", - g[v].index); + DEBUG_PRINTF("problem with escapes for %zu\n", g[v].index); first_bad_region = MIN(first_bad_region, v_region); } } @@ -403,7 +398,7 @@ makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, vector to_clear; assert(contains(lhs_map, curr_exits.front())); NFAVertex p_u = lhs_map[curr_exits.front()]; - DEBUG_PRINTF("p_u: %u\n", prefix[p_u].index); + DEBUG_PRINTF("p_u: %zu\n", prefix[p_u].index); for (auto p_v : adjacent_vertices_range(p_u, prefix)) { auto v = rev_map.at(p_v); if (p_v == prefix.accept || regions.at(v) < dead_region) { @@ -413,7 +408,7 @@ makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, } for (auto v : to_clear) { - DEBUG_PRINTF("clearing in_edges on %u\n", prefix[v].index); + DEBUG_PRINTF("clearing in_edges on %zu\n", prefix[v].index); clear_in_edges(v, prefix); } @@ -576,7 +571,7 @@ void replaceExternalReportsWithSomRep(ReportManager &rm, NGHolder &g, ir.somDistance = param; ReportID rep = rm.getInternalId(ir); - DEBUG_PRINTF("vertex %u, replacing report %u with %u (type %u)\n", + DEBUG_PRINTF("vertex %zu, replacing report %u with %u (type %u)\n", g[v].index, report_id, rep, ir_type); r_new.insert(rep); } @@ -690,31 +685,26 @@ void fillHolderForLockCheck(NGHolder *out, const NGHolder &g, map::const_iterator picked) { /* NOTE: This is appropriate for firstMatchIsFirst */ DEBUG_PRINTF("prepping for lock check\n"); + NGHolder &midfix = *out; - add_edge(midfix.startDs, midfix.accept, midfix); map v_map; v_map[g.start] = midfix.start; v_map[g.startDs] = midfix.startDs; - map::const_iterator jt = picked; - /* include the lock region */ - assert(jt != info.end()); - ++jt; - assert(!jt->second.dag); - assert(jt->second.full.size() == 1); + assert(picked != info.end()); + auto graph_last = next(picked); - for (; ; --jt) 
{ + assert(!graph_last->second.dag); + assert(graph_last->second.full.size() == 1); + + for (auto jt = graph_last; ; --jt) { DEBUG_PRINTF("adding r %u to midfix\n", jt->first); - if (!jt->second.optional) { - clear_out_edges(midfix.startDs, midfix); - add_edge(midfix.startDs, midfix.startDs, midfix); - } /* add all vertices in region, create mapping */ for (auto v : jt->second.full) { - DEBUG_PRINTF("adding v %u to midfix\n", g[v].index); + DEBUG_PRINTF("adding v %zu to midfix\n", g[v].index); if (contains(v_map, v)) { continue; } @@ -746,20 +736,33 @@ void fillHolderForLockCheck(NGHolder *out, const NGHolder &g, } } - /* add edges from startds to enters */ + if (jt == info.begin()) { + break; + } + } + + /* add edges from startds to the enters of all the initial optional + * regions and the first mandatory region. */ + for (auto jt = info.begin(); ; ++jt) { for (auto enter : jt->second.enters) { assert(contains(v_map, enter)); NFAVertex v = v_map[enter]; add_edge_if_not_present(midfix.startDs, v, midfix); } - if (jt == info.begin()) { + if (!jt->second.optional) { + break; + } + + if (jt == graph_last) { + /* all regions are optional - add a direct edge to accept */ + add_edge_if_not_present(midfix.startDs, midfix.accept, midfix); break; } } assert(in_degree(midfix.accept, midfix)); - midfix.renumberVertices(); + renumber_vertices(midfix); } static @@ -786,7 +789,7 @@ void fillRoughMidfix(NGHolder *out, const NGHolder &g, /* add all vertices in region, create mapping */ for (auto v : jt->second.full) { - DEBUG_PRINTF("adding v %u to midfix\n", g[v].index); + DEBUG_PRINTF("adding v %zu to midfix\n", g[v].index); NFAVertex vnew = add_vertex(g[v], midfix); v_map[v] = vnew; } @@ -826,7 +829,7 @@ void fillRoughMidfix(NGHolder *out, const NGHolder &g, do { for (auto v : jt->second.exits) { - DEBUG_PRINTF("adding v %u to midfix\n", g[v].index); + DEBUG_PRINTF("adding v %zu to midfix\n", g[v].index); NFAVertex vnew = add_vertex(g[v], midfix); v_map[v] = vnew; @@ -1013,8 +1016,7 @@ bool addPlan(vector &plan, u32 parent) { // Fetches all preds of {accept, acceptEod} for this graph. static void addReporterVertices(const NGHolder &g, vector &reporters) { - // Order reporter vertices by index for determinism. 
- set > tmp(g); + set tmp; insert(&tmp, inv_adjacent_vertices(g.accept, g)); insert(&tmp, inv_adjacent_vertices(g.acceptEod, g)); tmp.erase(g.accept); @@ -1022,7 +1024,7 @@ void addReporterVertices(const NGHolder &g, vector &reporters) { #ifdef DEBUG DEBUG_PRINTF("add reporters:"); for (UNUSED auto v : tmp) { - printf(" %u", g[v].index); + printf(" %zu", g[v].index); } printf("\n"); #endif @@ -1036,7 +1038,7 @@ void addReporterVertices(const region_info &r, const NGHolder &g, vector &reporters) { for (auto v : r.exits) { if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) { - DEBUG_PRINTF("add reporter %u\n", g[v].index); + DEBUG_PRINTF("add reporter %zu\n", g[v].index); reporters.push_back(v); } } @@ -1049,7 +1051,7 @@ void addMappedReporterVertices(const region_info &r, const NGHolder &g, vector &reporters) { for (auto v : r.exits) { if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) { - DEBUG_PRINTF("adding v=%u\n", g[v].index); + DEBUG_PRINTF("adding v=%zu\n", g[v].index); ue2::unordered_map::const_iterator it = mapping.find(v); assert(it != mapping.end()); @@ -1106,7 +1108,7 @@ void expandGraph(NGHolder &g, ue2::unordered_map ®ions, } for (auto enter : enters) { - DEBUG_PRINTF("processing enter %u\n", g[enter].index); + DEBUG_PRINTF("processing enter %zu\n", g[enter].index); map orig_to_copy; // Make a copy of all of the tail vertices, storing region info along @@ -1156,7 +1158,7 @@ void expandGraph(NGHolder &g, ue2::unordered_map ®ions, [&](const NFAEdge &e) { NFAVertex u = source(e, g); return regions.at(u) < split_region; - }, g.g); + }, g); } new_enters.push_back(orig_to_copy[enter]); @@ -1328,7 +1330,7 @@ bool doTreePlanning(NGHolder &g, dumpHolder(g, g_regions, 14, "som_expandedtree", grey); for (auto v : enters) { - DEBUG_PRINTF("enter %u\n", g[v].index); + DEBUG_PRINTF("enter %zu\n", g[v].index); // For this entry vertex, construct a version of the graph without the // other entries in this region (g_path), and calculate its depths and @@ -1563,12 +1565,12 @@ void dumpSomPlan(UNUSED const NGHolder &g, UNUSED const som_plan &p, p.is_reset, p.parent); printf(" reporters:"); for (auto v : p.reporters) { - printf(" %u", g[v].index); + printf(" %zu", g[v].index); } printf("\n"); printf(" reporters_in:"); for (auto v : p.reporters_in) { - printf(" %u", g[v].index); + printf(" %zu", g[v].index); } printf("\n"); #endif @@ -1634,7 +1636,7 @@ void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, /* create prefix to set the som_loc */ if (!plan.front().no_implement) { - plan.front().prefix->renumberVertices(); + renumber_vertices(*plan.front().prefix); assert(plan.front().prefix->kind == NFA_OUTFIX); if (!ng.addHolder(*plan.front().prefix)) { throw CompileError(w.expressionIndex, "Pattern is too large."); @@ -1746,7 +1748,7 @@ aligned_unique_ptr makeBareSomRevNfa(const NGHolder &g, setZeroReports(g_rev); // Prep for actual construction. - g_rev.renumberVertices(); + renumber_vertices(g_rev); g_rev.kind = NFA_REV_PREFIX; reduceGraphEquivalences(g_rev, cc); removeRedundancy(g_rev, SOM_NONE); @@ -1786,7 +1788,7 @@ bool makeSomRevNfa(vector &som_nfas, const NGHolder &g, return true; } - g2.renumberVertices(); // for findMinWidth, findMaxWidth. + renumber_vertices(g2); // for findMinWidth, findMaxWidth. 
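A pattern worth noting in the hunks above: every structural edit to a holder is now followed by a free-function `renumber_vertices()` (and, where edges change, `renumber_edges()`) call before passes such as `calcVertexOffsets`, `findMinWidth` or `findMaxWidth` run, since those passes key side arrays by the stored vertex index. A minimal generic sketch of the idea, using a hypothetical bundled-property BGL graph rather than Hyperscan's own graph type:

```cpp
#include <boost/graph/adjacency_list.hpp>

struct VProps { size_t index = 0; };
using Graph = boost::adjacency_list<boost::listS, boost::listS,
                                    boost::bidirectionalS, VProps>;

// Reassign dense, contiguous indices after vertices have been added or
// removed; anything previously keyed by index (depths, state arrays)
// becomes stale and must be recomputed, hence "renumbering invalidates
// depths" above.
size_t renumberVertices(Graph &g) {
    size_t i = 0;
    for (auto [vi, ve] = boost::vertices(g); vi != ve; ++vi) {
        g[*vi].index = i++;
    }
    return i;
}

// A pass can then safely size a side array by vertex count:
//   std::vector<int> depth(renumberVertices(g));
//   depth[g[v].index] = ...;
```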
aligned_unique_ptr nfa = makeBareSomRevNfa(g2, cc); if (!nfa) { @@ -2221,7 +2223,7 @@ bool leadingLiterals(const NGHolder &g, set *lits, for (const auto &m : curr) { const NFAVertex u = m.first; const vector &base = m.second; - DEBUG_PRINTF("expanding from %u\n", g[u].index); + DEBUG_PRINTF("expanding from %zu\n", g[u].index); for (auto v : adjacent_vertices_range(u, g)) { if (v == g.startDs) { continue; @@ -2234,8 +2236,7 @@ bool leadingLiterals(const NGHolder &g, set *lits, DEBUG_PRINTF("match\n"); goto skip_to_next_terminal; } - if (g[v].char_reach.count() - > 2 * MAX_LEADING_LITERALS) { + if (g[v].char_reach.count() > 2 * MAX_LEADING_LITERALS) { DEBUG_PRINTF("wide\n"); goto skip_to_next_terminal; } @@ -2251,8 +2252,8 @@ bool leadingLiterals(const NGHolder &g, set *lits, CharReach cr = g[v].char_reach; vector &out = next[v]; - DEBUG_PRINTF("expanding to %u (|| = %zu)\n", - g[v].index, cr.count()); + DEBUG_PRINTF("expanding to %zu (|| = %zu)\n", g[v].index, + cr.count()); for (size_t c = cr.find_first(); c != CharReach::npos; c = cr.find_next(c)) { bool nocase = ourisalpha(c) && cr.test(mytoupper(c)) @@ -2328,7 +2329,7 @@ bool splitOffLeadingLiterals(const NGHolder &g, set *lit_out, set adj_term1; insert(&adj_term1, adjacent_vertices(*terms.begin(), g)); for (auto v : terms) { - DEBUG_PRINTF("term %u\n", g[v].index); + DEBUG_PRINTF("term %zu\n", g[v].index); set temp; insert(&temp, adjacent_vertices(v, g)); if (temp != adj_term1) { @@ -2355,7 +2356,7 @@ void findBestLiteral(const NGHolder &g, buildRegionMapping(g, regions, info, false); ue2_literal best; - NFAVertex best_v = nullptr; + NFAVertex best_v = NGHolder::null_vertex(); map::const_iterator lit = info.begin(); while (1) { @@ -2391,7 +2392,7 @@ bool splitOffBestLiteral(const NGHolder &g, const ue2::unordered_map ®ions, ue2_literal *lit_out, NGHolder *lhs, NGHolder *rhs, const CompileContext &cc) { - NFAVertex v = nullptr; + NFAVertex v = NGHolder::null_vertex(); findBestLiteral(g, regions, lit_out, &v, cc); if (lit_out->empty()) { @@ -2405,7 +2406,7 @@ bool splitOffBestLiteral(const NGHolder &g, splitGraph(g, v, lhs, &lhs_map, rhs, &rhs_map); - DEBUG_PRINTF("v = %u\n", g[v].index); + DEBUG_PRINTF("v = %zu\n", g[v].index); return true; } @@ -2625,7 +2626,7 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, } } else { DEBUG_PRINTF("has start->accept edge\n"); - if (hasGreaterInDegree(1, g.acceptEod, g)) { + if (in_degree(g.acceptEod, g) > 1) { DEBUG_PRINTF("also has a path to EOD\n"); return false; } @@ -2826,7 +2827,7 @@ map::const_iterator tryForLaterRevNfaCut(const NGHolder &g, reverseHolder(*prefix, g_rev); anchorStarts(g_rev); - g_rev.renumberVertices(); + renumber_vertices(g_rev); g_rev.kind = NFA_REV_PREFIX; reduceGraphEquivalences(g_rev, cc); removeRedundancy(g_rev, SOM_NONE); @@ -2870,7 +2871,7 @@ unique_ptr makePrefixForChain(NGHolder &g, } depths->clear(); /* renumbering invalidates depths */ - prefix->renumberVertices(); + renumber_vertices(*prefix); DEBUG_PRINTF("done\n"); return prefix; @@ -2886,8 +2887,7 @@ sombe_rv doSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, // Special case: if g is completely anchored or begins with a dot-star, we // know that we have an absolute SOM of zero all the time. 
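The `findBestLiteral` and `splitOffBestLiteral` hunks above replace the `nullptr` vertex sentinel with `NGHolder::null_vertex()`. This is the portable BGL idiom: a vertex descriptor need not be pointer-valued, and `graph_traits<G>::null_vertex()` is the only sanctioned "no vertex" value. A small illustrative sketch with generic BGL types (the graph and function names here are hypothetical):

```cpp
#include <boost/graph/adjacency_list.hpp>

using Graph = boost::adjacency_list<boost::listS, boost::listS,
                                    boost::bidirectionalS>;
using Vertex = boost::graph_traits<Graph>::vertex_descriptor;

// Return some vertex with no out-edges, or the null sentinel if there is
// none; callers compare the result against null_vertex(), not nullptr.
Vertex findSink(const Graph &g) {
    for (auto [vi, ve] = boost::vertices(g); vi != ve; ++vi) {
        if (boost::out_degree(*vi, g) == 0) {
            return *vi;
        }
    }
    return boost::graph_traits<Graph>::null_vertex();
}
```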
- assert(edge(g.startDs, g.startDs, g).second); - if (!hasGreaterOutDegree(1, g.startDs, g) || beginsWithDotStar(g)) { + if (!proper_out_degree(g.startDs, g) || beginsWithDotStar(g)) { makeSomAbsReports(rm, g, g.accept); makeSomAbsReports(rm, g, g.acceptEod); return SOMBE_HANDLED_INTERNAL; @@ -3004,7 +3004,7 @@ sombe_rv doSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, u32 rev_comp_id = doSomRevNfaPrefix(ng, w, *prefix, cc); updatePrefixReportsRevNFA(rm, *prefix, rev_comp_id); } - prefix->renumberVertices(); + renumber_vertices(*prefix); if (!ng.addHolder(*prefix)) { DEBUG_PRINTF("failed to add holder\n"); clear_graph(g); diff --git a/src/nfagraph/ng_som_add_redundancy.cpp b/src/nfagraph/ng_som_add_redundancy.cpp index 924cfad1..33544ec1 100644 --- a/src/nfagraph/ng_som_add_redundancy.cpp +++ b/src/nfagraph/ng_som_add_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -155,13 +155,13 @@ bool addSomRedundancy(NGHolder &g, vector &depths) { if (is_special(v, g)) { continue; } - if (!hasGreaterInDegree(0, v, g)) { + if (!in_degree(v, g)) { continue; // unreachable, probably killed } const DepthMinMax &d = getDepth(v, g, depths); - DEBUG_PRINTF("vertex %u has depths %s\n", g[v].index, + DEBUG_PRINTF("vertex %zu has depths %s\n", g[v].index, d.str().c_str()); if (d.min == d.max) { diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp index 676fb523..c4337341 100644 --- a/src/nfagraph/ng_som_util.cpp +++ b/src/nfagraph/ng_som_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,7 +76,7 @@ vector getDistancesFromSOM(const NGHolder &g_orig) { clear_in_edges(v, g); } - //dumpGraph("som_depth.dot", g.g); + //dumpGraph("som_depth.dot", g); vector temp_depths; // numbered by vertex index in g calcDepthsFrom(g, g.start, temp_depths); @@ -143,7 +143,7 @@ bool firstMatchIsFirst(const NGHolder &p) { for (auto v : vertices_range(p)) { assert(!is_virtual_start(v, p)); if (!is_special(v, p)) { - DEBUG_PRINTF("turning on %u\n", p[v].index); + DEBUG_PRINTF("turning on %zu\n", p[v].index); states.insert(v); } } @@ -154,9 +154,9 @@ bool firstMatchIsFirst(const NGHolder &p) { for (auto v : states) { /* need to check if this vertex may represent an infix match - ie * it does not have an edge to accept. 
*/ - DEBUG_PRINTF("check %u\n", p[v].index); + DEBUG_PRINTF("check %zu\n", p[v].index); if (!edge(v, p.accept, p).second) { - DEBUG_PRINTF("fail %u\n", p[v].index); + DEBUG_PRINTF("fail %zu\n", p[v].index); return false; } } @@ -186,14 +186,11 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, return cache.smgb[u]; } - DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index); + DEBUG_PRINTF("checking if som can go backwards on %zu\n", g[u].index); set be; BackEdges> backEdgeVisitor(be); - depth_first_search( - g.g, visitor(backEdgeVisitor) - .root_vertex(g.start) - .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); + boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start)); bool rv; if (0) { @@ -210,8 +207,7 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, NFAVertex s = source(e, g); NFAVertex t = target(e, g); /* only need to worry about big cycles including/before u */ - DEBUG_PRINTF("back edge %u %u\n", g[s].index, - g[t].index); + DEBUG_PRINTF("back edge %zu %zu\n", g[s].index, g[t].index); if (s != t && region_map.at(s) <= u_region) { DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ @@ -268,13 +264,13 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, pruneUseless(c_g); be.clear(); - depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start). - vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g))); + boost::depth_first_search(c_g, visitor(backEdgeVisitor) + .root_vertex(c_g.start)); for (const auto &e : be) { NFAVertex s = source(e, c_g); NFAVertex t = target(e, c_g); - DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index); + DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index); if (s != t) { assert(0); DEBUG_PRINTF("eek big cycle\n"); @@ -326,7 +322,7 @@ bool sentClearsTail(const NGHolder &g, } for (UNUSED auto v : states) { - DEBUG_PRINTF("start state: %u\n", g[v].index); + DEBUG_PRINTF("start state: %zu\n", g[v].index); } /* run the prefix the main graph */ @@ -338,7 +334,7 @@ bool sentClearsTail(const NGHolder &g, continue; /* not in tail */ } - DEBUG_PRINTF("v %u is still on\n", g[v].index); + DEBUG_PRINTF("v %zu is still on\n", g[v].index); assert(v != g.accept && v != g.acceptEod); /* no cr */ assert(contains(region_map, v)); diff --git a/src/nfagraph/ng_split.cpp b/src/nfagraph/ng_split.cpp index bce638c0..3c2baee4 100644 --- a/src/nfagraph/ng_split.cpp +++ b/src/nfagraph/ng_split.cpp @@ -87,7 +87,7 @@ void splitLHS(const NGHolder &base, const vector &pivots, clearAccepts(*lhs); for (auto pivot : pivots) { - DEBUG_PRINTF("pivot is %u lv %zu lm %zu\n", base[pivot].index, + DEBUG_PRINTF("pivot is %zu lv %zu lm %zu\n", base[pivot].index, num_vertices(*lhs), lhs_map->size()); assert(contains(*lhs_map, pivot)); @@ -151,7 +151,8 @@ void splitRHS(const NGHolder &base, const vector &pivots, for (auto pivot : pivots) { assert(contains(*rhs_map, pivot)); - add_edge(rhs->start, (*rhs_map)[pivot], *rhs); + NFAEdge e = add_edge(rhs->start, (*rhs_map)[pivot], *rhs); + (*rhs)[e].tops.insert(DEFAULT_TOP); } /* should do the renumbering unconditionally as we know edges are already @@ -190,8 +191,8 @@ void findCommonSuccessors(const NGHolder &g, const vector &pivots, vector &succ) { assert(!pivots.empty()); - // Note: for determinism, we must sort our successor sets by vertex_index. 
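The comment removed just above (and its twin in `addReporterVertices` earlier) explains why these sets previously carried an ordering functor, whose template arguments have been lost to formatting here: iteration over a `std::set` of descriptors is only reproducible across runs if the comparator is deterministic, for instance by comparing stored indices. A sketch of that older approach over a generic bundled-index graph (names hypothetical):

```cpp
#include <boost/graph/adjacency_list.hpp>
#include <set>

struct VProps { size_t index = 0; };
using Graph = boost::adjacency_list<boost::listS, boost::listS,
                                    boost::bidirectionalS, VProps>;
using Vertex = boost::graph_traits<Graph>::vertex_descriptor;

// Order vertex descriptors by their stored index so that set iteration
// is stable, even when the descriptors themselves are heap pointers.
struct ByIndex {
    const Graph *g = nullptr;
    bool operator()(Vertex a, Vertex b) const {
        return (*g)[a].index < (*g)[b].index;
    }
};

using OrderedVertexSet = std::set<Vertex, ByIndex>;
// usage: OrderedVertexSet s(ByIndex{&g});
```

With descriptors that already compare deterministically, a plain `std::set<NFAVertex>` suffices, which is what these hunks switch to.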
- set > adj(g), adj_temp(g); + set adj; + set adj_temp; insert(&adj, adjacent_vertices(pivots.at(0), g)); @@ -215,6 +216,7 @@ void splitGraph(const NGHolder &base, const vector &pivots, DEBUG_PRINTF("splitting graph at %zu vertices\n", pivots.size()); assert(!has_parallel_edge(base)); + assert(isCorrectlyTopped(base)); /* RHS pivots are built from the common set of successors of pivots. */ vector rhs_pivots; @@ -228,6 +230,8 @@ void splitGraph(const NGHolder &base, const vector &pivots, assert(!has_parallel_edge(*lhs)); assert(!has_parallel_edge(*rhs)); + assert(isCorrectlyTopped(*lhs)); + assert(isCorrectlyTopped(*rhs)); } void splitGraph(const NGHolder &base, NFAVertex pivot, diff --git a/src/nfagraph/ng_split.h b/src/nfagraph/ng_split.h index 75577e97..31c1cf35 100644 --- a/src/nfagraph/ng_split.h +++ b/src/nfagraph/ng_split.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,8 @@ class NGHolder; * is in the lhs if it is reachable from start without going through the * pivot. The pivot ends up in the LHS and any adjacent vertices in the RHS. * + * Note: The RHS is setup to be triggered by TOP 0 + * * When multiple split vertices are provided: * - RHS contains all vertices reachable from every pivot * - LHS contains all vertices which are reachable from start ignoring any diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp index 6577673f..ebec3a4a 100644 --- a/src/nfagraph/ng_squash.cpp +++ b/src/nfagraph/ng_squash.cpp @@ -102,7 +102,6 @@ #include "ng_holder.h" #include "ng_prune.h" #include "ng_region.h" -#include "ng_restructuring.h" #include "ng_som_util.h" #include "ng_util.h" #include "ng_util.h" @@ -135,8 +134,7 @@ void buildPDomTree(const NGHolder &g, PostDomTree &tree) { } NFAVertex pdom = postdominators[v]; if (pdom) { - DEBUG_PRINTF("vertex %u -> %u\n", g[pdom].index, - g[v].index); + DEBUG_PRINTF("vertex %zu -> %zu\n", g[pdom].index, g[v].index); tree[pdom].insert(v); } } @@ -154,8 +152,7 @@ void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v, som_type som, const vector &som_depths, const ue2::unordered_map ®ion_map, smgb_cache &cache) { - DEBUG_PRINTF("build base squash mask for vertex %u)\n", - g[v].index); + DEBUG_PRINTF("build base squash mask for vertex %zu)\n", g[v].index); vector q; @@ -302,7 +299,7 @@ void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, } NFAStateSet u_squash(init.size()); - u32 u_index = g[u].index; + size_t u_index = g[u].index; buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex, pdom_tree, som, som_depths, region_map, cache); @@ -310,7 +307,7 @@ void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, u_squash.set(u_index); /* never clear ourselves */ if ((~u_squash).any()) { // i.e. 
some bits unset in mask - DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index, + DEBUG_PRINTF("%zu is an upstream squasher of %zu\n", u_index, g[v].index); (*squash)[u] = u_squash; remaining.push_back(u); @@ -522,8 +519,7 @@ void filterSquashers(const NGHolder &g, if (!contains(squash, v)) { continue; } - DEBUG_PRINTF("looking at squash set for vertex %u\n", - g[v].index); + DEBUG_PRINTF("looking at squash set for vertex %zu\n", g[v].index); if (!hasSelfLoop(v, g)) { DEBUG_PRINTF("acyclic\n"); @@ -601,7 +597,7 @@ void removeEdgesToAccept(NGHolder &g, NFAVertex v) { NFAVertex u = source(e, g); const auto &r = g[u].reports; if (!r.empty() && is_subset_of(r, reports)) { - DEBUG_PRINTF("vertex %u\n", g[u].index); + DEBUG_PRINTF("vertex %zu\n", g[u].index); dead.insert(e); } } @@ -610,7 +606,7 @@ void removeEdgesToAccept(NGHolder &g, NFAVertex v) { NFAVertex u = source(e, g); const auto &r = g[u].reports; if (!r.empty() && is_subset_of(r, reports)) { - DEBUG_PRINTF("vertex %u\n", g[u].index); + DEBUG_PRINTF("vertex %zu\n", g[u].index); dead.insert(e); } } @@ -621,7 +617,7 @@ void removeEdgesToAccept(NGHolder &g, NFAVertex v) { static vector findUnreachable(const NGHolder &g) { - const boost::reverse_graph revg(g.g); + const boost::reverse_graph revg(g); ue2::unordered_map colours; colours.reserve(num_vertices(g)); @@ -634,7 +630,7 @@ vector findUnreachable(const NGHolder &g) { vector unreach; for (auto v : vertices_range(revg)) { if (!contains(colours, v)) { - unreach.push_back(v); + unreach.push_back(NFAVertex(v)); } } return unreach; @@ -657,7 +653,7 @@ findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) { const u32 numStates = num_vertices(g); for (auto v : verts) { - DEBUG_PRINTF("vertex %u with %zu reports\n", g[v].index, + DEBUG_PRINTF("vertex %zu with %zu reports\n", g[v].index, g[v].reports.size()); // Find the set of vertices that lead to v or any other reporter with a @@ -684,7 +680,7 @@ findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) { NFAStateSet &mask = squash[v]; for (auto uv : unreach) { - DEBUG_PRINTF("squashes index %u\n", h[uv].index); + DEBUG_PRINTF("squashes index %zu\n", h[uv].index); mask.reset(h[uv].index); } } diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp index 217183de..4ad5ff78 100644 --- a/src/nfagraph/ng_uncalc_components.cpp +++ b/src/nfagraph/ng_uncalc_components.cpp @@ -39,7 +39,6 @@ #include "ng_limex.h" #include "ng_redundancy.h" #include "ng_region.h" -#include "ng_restructuring.h" #include "ng_uncalc_components.h" #include "ng_util.h" #include "ue2common.h" @@ -55,42 +54,52 @@ #include #include +#include + using namespace std; +using boost::adaptors::map_values; namespace ue2 { static const u32 FAST_STATE_LIMIT = 256; /**< largest possible desirable NFA */ /** Sentinel value meaning no component has yet been selected. 
*/ -static const u32 NO_COMPONENT = 0xffffffffu; +static const u32 NO_COMPONENT = ~0U; -static -vector getSortedVA(const NGHolder &g, - const ue2::unordered_map &state_ids) { - vector out; - out.reserve(num_vertices(g)); +static const u32 UNUSED_STATE = ~0U; - for (auto v : vertices_range(g)) { - assert(contains(state_ids, v)); - if (state_ids.at(v) == NO_STATE) { - continue; +namespace { +struct ranking_info { + explicit ranking_info(const NGHolder &h) : to_vertex(getTopoOrdering(h)) { + u32 rank = 0; + + reverse(to_vertex.begin(), to_vertex.end()); + + for (NFAVertex v : to_vertex) { + to_rank[v] = rank++; + } + + for (NFAVertex v : vertices_range(h)) { + if (!contains(to_rank, v)) { + to_rank[v] = UNUSED_STATE; + } } - out.push_back(v); } - // Order vertices by their state indices. - sort(begin(out), end(out), [&state_ids](NFAVertex a, NFAVertex b) { - return state_ids.at(a) < state_ids.at(b); - }); - -#ifndef NDEBUG - // State indices should match vector indices. - for (u32 i = 0; i < out.size(); i++) { - assert(state_ids.at(out.at(i)) == i); + NFAVertex at(u32 ranking) const { return to_vertex.at(ranking); } + u32 get(NFAVertex v) const { return to_rank.at(v); } + u32 size() const { return (u32)to_vertex.size(); } + u32 add_to_tail(NFAVertex v) { + u32 rank = size(); + to_rank[v] = rank; + to_vertex.push_back(v); + return rank; } -#endif - return out; +private: + vector to_vertex; + unordered_map to_rank; +}; } static never_inline @@ -122,9 +131,9 @@ bool cplVerticesMatch(const NGHolder &ga, NFAVertex va, } static never_inline -u32 cplCommonReachAndSimple(const NGHolder &ga, const vector &a, - const NGHolder &gb, const vector &b) { - u32 ml = min(a.size(), b.size()); +u32 cplCommonReachAndSimple(const NGHolder &ga, const ranking_info &a_ranking, + const NGHolder &gb, const ranking_info &b_ranking) { + u32 ml = min(a_ranking.size(), b_ranking.size()); if (ml > 65535) { ml = 65535; } @@ -133,7 +142,7 @@ u32 cplCommonReachAndSimple(const NGHolder &ga, const vector &a, // "startedness" properties. 
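`ranking_info` above derives dense state numbers from a topological ordering instead of a separate state map: `getTopoOrdering()` returns reverse topological order, so reversing it and counting up assigns rank 0 to a source vertex. A generic restatement in plain BGL (hypothetical function name):

```cpp
#include <algorithm>
#include <iterator>
#include <unordered_map>
#include <vector>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/topological_sort.hpp>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::directedS>;
using Vertex = boost::graph_traits<Graph>::vertex_descriptor;

// Rank vertices by topological position. topological_sort emits vertices
// in *reverse* topological order, so reversing gives sources rank 0.
// Precondition: g is a DAG (back edges already filtered out).
std::unordered_map<Vertex, unsigned> rankByTopoOrder(const Graph &g) {
    std::vector<Vertex> order;
    boost::topological_sort(g, std::back_inserter(order));
    std::reverse(order.begin(), order.end());

    std::unordered_map<Vertex, unsigned> rank;
    unsigned r = 0;
    for (Vertex v : order) {
        rank[v] = r++;
    }
    return rank;
}
```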
u32 max = 0; for (; max < ml; max++) { - if (!cplVerticesMatch(ga, a[max], gb, b[max])) { + if (!cplVerticesMatch(ga, a_ranking.at(max), gb, b_ranking.at(max))) { break; } } @@ -141,34 +150,30 @@ u32 cplCommonReachAndSimple(const NGHolder &ga, const vector &a, return max; } -u32 commonPrefixLength(const NGHolder &ga, - const ue2::unordered_map &a_state_ids, - const NGHolder &gb, - const ue2::unordered_map &b_state_ids) { - vector a = getSortedVA(ga, a_state_ids); - vector b = getSortedVA(gb, b_state_ids); - +static +u32 commonPrefixLength(const NGHolder &ga, const ranking_info &a_ranking, + const NGHolder &gb, const ranking_info &b_ranking) { /* upper bound on the common region based on local properties */ - u32 max = cplCommonReachAndSimple(ga, a, gb, b); + u32 max = cplCommonReachAndSimple(ga, a_ranking, gb, b_ranking); DEBUG_PRINTF("cpl upper bound %u\n", max); while (max > 0) { - bool ok = true; - /* shrink max region based on in-edges from outside the region */ for (size_t j = max; j > 0; j--) { - for (auto u : inv_adjacent_vertices_range(a[j - 1], ga)) { - u32 state_id = a_state_ids.at(u); - if (state_id != NO_STATE && state_id >= max) { + NFAVertex a_v = a_ranking.at(j - 1); + NFAVertex b_v = b_ranking.at(j - 1); + for (auto u : inv_adjacent_vertices_range(a_v, ga)) { + u32 state_id = a_ranking.get(u); + if (state_id != UNUSED_STATE && state_id >= max) { max = j - 1; DEBUG_PRINTF("lowering max to %u\n", max); goto next_vertex; } } - for (auto u : inv_adjacent_vertices_range(b[j - 1], gb)) { - u32 state_id = b_state_ids.at(u); - if (state_id != NO_STATE && state_id >= max) { + for (auto u : inv_adjacent_vertices_range(b_v, gb)) { + u32 state_id = b_ranking.get(u); + if (state_id != UNUSED_STATE && state_id >= max) { max = j - 1; DEBUG_PRINTF("lowering max to %u\n", max); goto next_vertex; @@ -180,44 +185,37 @@ u32 commonPrefixLength(const NGHolder &ga, /* Ensure that every pair of vertices has same out-edges to vertices in the region. 
*/ - for (size_t i = 0; ok && i < max; i++) { + for (size_t i = 0; i < max; i++) { size_t a_count = 0; size_t b_count = 0; - NGHolder::out_edge_iterator ei, ee; - for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) { - u32 sid = a_state_ids.at(target(*ei, ga)); - if (sid == NO_STATE || sid >= max) { + for (NFAEdge a_edge : out_edges_range(a_ranking.at(i), ga)) { + u32 sid = a_ranking.get(target(a_edge, ga)); + if (sid == UNUSED_STATE || sid >= max) { continue; } a_count++; - NFAEdge b_edge; - bool has_b_edge; - tie(b_edge, has_b_edge) = edge(b[i], b[sid], gb); + NFAEdge b_edge = edge(b_ranking.at(i), b_ranking.at(sid), gb); - if (!has_b_edge) { + if (!b_edge) { max = i; - ok = false; DEBUG_PRINTF("lowering max to %u due to edge %zu->%u\n", max, i, sid); - break; + goto try_smaller; } - if (ga[*ei].top != gb[b_edge].top) { + if (ga[a_edge].tops != gb[b_edge].tops) { max = i; - ok = false; - DEBUG_PRINTF("tops don't match on edge %zu->%u\n", - i, sid); + DEBUG_PRINTF("tops don't match on edge %zu->%u\n", i, sid); + goto try_smaller; } } - NGHolder::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae; - ++ai) { - u32 sid = b_state_ids.at(*ai); - if (sid == NO_STATE || sid >= max) { + for (NFAVertex b_v : adjacent_vertices_range(b_ranking.at(i), gb)) { + u32 sid = b_ranking.get(b_v); + if (sid == UNUSED_STATE || sid >= max) { continue; } @@ -226,52 +224,54 @@ u32 commonPrefixLength(const NGHolder &ga, if (a_count != b_count) { max = i; - DEBUG_PRINTF("lowering max to %u due to a,b count " - "(a_count=%zu, b_count=%zu)\n", max, a_count, - b_count); - ok = false; + DEBUG_PRINTF("lowering max to %u due to a,b count (a_count=%zu," + " b_count=%zu)\n", max, a_count, b_count); + goto try_smaller; } } - if (ok) { - DEBUG_PRINTF("survived checks, returning cpl %u\n", max); - return max; - } + DEBUG_PRINTF("survived checks, returning cpl %u\n", max); + return max; + try_smaller:; } DEBUG_PRINTF("failed to find any common region\n"); return 0; } +u32 commonPrefixLength(const NGHolder &ga, const NGHolder &gb) { + return commonPrefixLength(ga, ranking_info(ga), gb, ranking_info(gb)); +} + static never_inline -void mergeNfa(NGHolder &dest, vector &destStateMap, - ue2::unordered_map &dest_state_ids, - NGHolder &vic, vector &vicStateMap, - size_t common_len) { +void mergeNfaComponent(NGHolder &dest, const NGHolder &vic, size_t common_len) { + assert(&dest != &vic); + + auto dest_info = ranking_info(dest); + auto vic_info = ranking_info(vic); + map vmap; // vic -> dest vmap[vic.start] = dest.start; vmap[vic.startDs] = dest.startDs; vmap[vic.accept] = dest.accept; vmap[vic.acceptEod] = dest.acceptEod; - vmap[nullptr] = nullptr; - - u32 stateNum = countStates(dest, dest_state_ids); + vmap[NGHolder::null_vertex()] = NGHolder::null_vertex(); // For vertices in the common len, add to vmap and merge in the reports, if // any. for (u32 i = 0; i < common_len; i++) { - NFAVertex v_old = vicStateMap[i], v = destStateMap[i]; + NFAVertex v_old = vic_info.at(i); + NFAVertex v = dest_info.at(i); vmap[v_old] = v; const auto &reports = vic[v_old].reports; dest[v].reports.insert(reports.begin(), reports.end()); } - // Add in vertices beyond the common len, giving them state numbers - // starting at stateNum. 
- for (u32 i = common_len; i < vicStateMap.size(); i++) { - NFAVertex v_old = vicStateMap[i]; + // Add in vertices beyond the common len + for (u32 i = common_len; i < vic_info.size(); i++) { + NFAVertex v_old = vic_info.at(i); if (is_special(v_old, vic)) { // Dest already has start vertices, just merge the reports. @@ -283,15 +283,17 @@ void mergeNfa(NGHolder &dest, vector &destStateMap, } NFAVertex v = add_vertex(vic[v_old], dest); - dest_state_ids[v] = stateNum++; + dest_info.add_to_tail(v); vmap[v_old] = v; } /* add edges */ DEBUG_PRINTF("common_len=%zu\n", common_len); for (const auto &e : edges_range(vic)) { - NFAVertex u_old = source(e, vic), v_old = target(e, vic); - NFAVertex u = vmap[u_old], v = vmap[v_old]; + NFAVertex u_old = source(e, vic); + NFAVertex v_old = target(e, vic); + NFAVertex u = vmap[u_old]; + NFAVertex v = vmap[v_old]; bool uspecial = is_special(u, dest); bool vspecial = is_special(v, dest); @@ -302,15 +304,14 @@ void mergeNfa(NGHolder &dest, vector &destStateMap, // We're in the common region if v's state ID is low enough, unless v // is a special (an accept), in which case we use u's state ID. - assert(contains(dest_state_ids, v)); - bool in_common_region = dest_state_ids.at(v) < common_len; - if (vspecial && dest_state_ids.at(u) < common_len) { + bool in_common_region = dest_info.get(v) < common_len; + if (vspecial && dest_info.get(u) < common_len) { in_common_region = true; } - DEBUG_PRINTF("adding idx=%u (state %u) -> idx=%u (state %u)%s\n", - dest[u].index, dest_state_ids.at(u), - dest[v].index, dest_state_ids.at(v), + DEBUG_PRINTF("adding idx=%zu (state %u) -> idx=%zu (state %u)%s\n", + dest[u].index, dest_info.get(u), + dest[v].index, dest_info.get(v), in_common_region ? " [common]" : ""); if (in_common_region) { @@ -318,7 +319,7 @@ void mergeNfa(NGHolder &dest, vector &destStateMap, DEBUG_PRINTF("skipping common edge\n"); assert(edge(u, v, dest).second); // Should never merge edges with different top values. - assert(vic[e].top == dest[edge(u, v, dest).first].top); + assert(vic[e].tops == dest[edge(u, v, dest)].tops); continue; } else { assert(is_any_accept(v, dest)); @@ -334,20 +335,8 @@ void mergeNfa(NGHolder &dest, vector &destStateMap, add_edge(u, v, vic[e], dest); } - dest.renumberEdges(); - dest.renumberVertices(); -} - -static never_inline -void mergeNfaComponent(NGHolder &pholder, NGHolder &vholder, size_t cpl) { - assert(&pholder != &vholder); - - auto v_state_ids = numberStates(vholder); - auto p_state_ids = numberStates(pholder); - auto vhvmap = getSortedVA(vholder, v_state_ids); - auto phvmap = getSortedVA(pholder, p_state_ids); - - mergeNfa(pholder, phvmap, p_state_ids, vholder, vhvmap, cpl); + renumber_edges(dest); + renumber_vertices(dest); } namespace { @@ -374,14 +363,19 @@ struct NfaMergeCandidateH { /** Returns true if graphs \p h1 and \p h2 can (and should) be merged. 
*/ static -bool shouldMerge(NGHolder &ha, - const ue2::unordered_map &a_state_ids, - NGHolder &hb, - const ue2::unordered_map &b_state_ids, - size_t cpl, const ReportManager *rm, - const CompileContext &cc) { - size_t combinedStateCount = - countStates(ha, a_state_ids) + countStates(hb, b_state_ids) - cpl; +bool shouldMerge(const NGHolder &ha, const NGHolder &hb, size_t cpl, + const ReportManager *rm, const CompileContext &cc) { + size_t combinedStateCount = num_vertices(ha) + num_vertices(hb) - cpl; + + combinedStateCount -= 2 * 2; /* discount accepts from both */ + + if (is_triggered(ha)) { + /* allow for a state for each top, ignore existing starts */ + combinedStateCount -= 2; /* for start, startDs */ + auto tops = getTops(ha); + insert(&tops, getTops(hb)); + combinedStateCount += tops.size(); + } if (combinedStateCount > FAST_STATE_LIMIT) { // More complex implementability check. @@ -424,11 +418,13 @@ void buildNfaMergeQueue(const vector &cluster, // First, make sure all holders have numbered states and collect their // counts. - vector> states_map(cs); + vector states_map; + states_map.reserve(cs); for (size_t i = 0; i < cs; i++) { assert(cluster[i]); - NGHolder &g = *(cluster[i]); - states_map[i] = numberStates(g); + assert(states_map.size() == i); + const NGHolder &g = *(cluster[i]); + states_map.emplace_back(g); } vector seen_cpl(cs * cs, 0); @@ -506,26 +502,25 @@ bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) { return false; } + /* TODO: relax top checks if reports match */ + // If both graphs have edge (start, accept), the tops must match. - auto e1_accept = edge(h1.start, h1.accept, h1); - auto e2_accept = edge(h2.start, h2.accept, h2); - if (e1_accept.second && e2_accept.second && - h1[e1_accept.first].top != h2[e2_accept.first].top) { + NFAEdge e1_accept = edge(h1.start, h1.accept, h1); + NFAEdge e2_accept = edge(h2.start, h2.accept, h2); + if (e1_accept && e2_accept && h1[e1_accept].tops != h2[e2_accept].tops) { return false; } // If both graphs have edge (start, acceptEod), the tops must match. - auto e1_eod = edge(h1.start, h1.acceptEod, h1); - auto e2_eod = edge(h2.start, h2.acceptEod, h2); - if (e1_eod.second && e2_eod.second && - h1[e1_eod.first].top != h2[e2_eod.first].top) { + NFAEdge e1_eod = edge(h1.start, h1.acceptEod, h1); + NFAEdge e2_eod = edge(h2.start, h2.acceptEod, h2); + if (e1_eod && e2_eod && h1[e1_eod].tops != h2[e2_eod].tops) { return false; } // If one graph has an edge to accept and the other has an edge to // acceptEod, the reports must match for the merge to be safe. - if ((e1_accept.second && e2_eod.second) || - (e2_accept.second && e1_eod.second)) { + if ((e1_accept && e2_eod) || (e2_accept && e1_eod)) { if (h1[h1.start].reports != h2[h2.start].reports) { return false; } @@ -535,11 +530,9 @@ bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) { } /** Merge graph \p ga into graph \p gb. Returns false on failure. */ -bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm, +bool mergeNfaPair(const NGHolder &ga, NGHolder &gb, const ReportManager *rm, const CompileContext &cc) { assert(ga.kind == gb.kind); - auto a_state_ids = numberStates(ga); - auto b_state_ids = numberStates(gb); // Vacuous NFAs require special checks on their starts to ensure that tops // match, and that reports match for mixed-accept cases. 
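The reworked `shouldMerge()` above estimates the merged machine's size straight from vertex counts rather than numbering states first. Condensed into a hypothetical helper, the accounting is:

```cpp
#include <cstddef>

// na, nb: num_vertices() of the two holders; cpl: common prefix length.
size_t mergedStateEstimate(size_t na, size_t nb, size_t cpl,
                           bool triggered, size_t num_distinct_tops) {
    size_t combined = na + nb - cpl; // shared prefix counted once
    combined -= 2 * 2;               // accept/acceptEod exist in both
    if (triggered) {
        combined -= 2;               // start/startDs need no states...
        combined += num_distinct_tops; // ...but each top gets an entry state
    }
    return combined;
}
// e.g. na = 12, nb = 10, cpl = 6, untriggered: 12 + 10 - 6 - 4 = 12 states,
// comfortably under FAST_STATE_LIMIT (256).
```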
@@ -548,29 +541,26 @@ bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm, return false; } - u32 cpl = commonPrefixLength(ga, a_state_ids, gb, b_state_ids); - if (!shouldMerge(gb, b_state_ids, ga, a_state_ids, cpl, rm, cc)) { + u32 cpl = commonPrefixLength(ga, gb); + if (!shouldMerge(gb, ga, cpl, rm, cc)) { return false; } mergeNfaComponent(gb, ga, cpl); reduceImplementableGraph(gb, SOM_NONE, rm, cc); - b_state_ids = numberStates(gb); return true; } -/** Merge the group of graphs in \p cluster where possible. The (from, to) - * mapping of merged graphs is returned in \p merged. */ -void mergeNfaCluster(const vector &cluster, - const ReportManager *rm, - map &merged, - const CompileContext &cc) { +map mergeNfaCluster(const vector &cluster, + const ReportManager *rm, + const CompileContext &cc) { + map merged; + if (cluster.size() < 2) { - return; + return merged; } DEBUG_PRINTF("new cluster, size %zu\n", cluster.size()); - merged.clear(); priority_queue pq; buildNfaMergeQueue(cluster, &pq); @@ -599,6 +589,8 @@ void mergeNfaCluster(const vector &cluster, } } } + + return merged; } } // namespace ue2 diff --git a/src/nfagraph/ng_uncalc_components.h b/src/nfagraph/ng_uncalc_components.h index 5f341961..b0f42670 100644 --- a/src/nfagraph/ng_uncalc_components.h +++ b/src/nfagraph/ng_uncalc_components.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,9 +36,6 @@ #include #include -#include "nfagraph/ng_graph.h" -#include "util/ue2_containers.h" - namespace ue2 { struct CompileContext; @@ -52,20 +49,16 @@ class ReportManager; * The CPL is calculated based the topological ordering given by the state * indices for each graph. */ -u32 commonPrefixLength(const NGHolder &ga, - const ue2::unordered_map &a_state_ids, - const NGHolder &gb, - const ue2::unordered_map &b_state_ids); +u32 commonPrefixLength(const NGHolder &ga, const NGHolder &gb); /** * \brief Merge the group of graphs in \p cluster where possible. * - * The (from, to) mapping of merged graphs is returned in \p merged. + * The (from, to) mapping of merged graphs is returned. */ -void mergeNfaCluster(const std::vector &cluster, - const ReportManager *rm, - std::map &merged, - const CompileContext &cc); +std::map +mergeNfaCluster(const std::vector &cluster, const ReportManager *rm, + const CompileContext &cc); /** * \brief Merge graph \p ga into graph \p gb. @@ -73,7 +66,7 @@ void mergeNfaCluster(const std::vector &cluster, * Returns false on failure. On success, \p gb is reduced via \ref * reduceImplementableGraph and renumbered. 
*/ -bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm, +bool mergeNfaPair(const NGHolder &ga, NGHolder &gb, const ReportManager *rm, const CompileContext &cc); } // namespace ue2 diff --git a/src/nfagraph/ng_undirected.h b/src/nfagraph/ng_undirected.h index 12632e05..7df6c7dc 100644 --- a/src/nfagraph/ng_undirected.h +++ b/src/nfagraph/ng_undirected.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +39,10 @@ #include "util/graph_range.h" #include "util/ue2_containers.h" +#include + +#include + namespace ue2 { /** @@ -51,7 +55,7 @@ namespace ue2 { typedef boost::adjacency_list > + boost::property > NFAUndirectedGraph; typedef NFAUndirectedGraph::vertex_descriptor NFAUndirectedVertex; @@ -60,16 +64,18 @@ typedef NFAUndirectedGraph::vertex_descriptor NFAUndirectedVertex; * Make a copy of an NFAGraph with undirected edges, optionally without start * vertices. Mappings from the original graph to the new one are provided. * - * Note that new vertex indices are assigned contiguously in \a vertices(g) order. + * Note that new vertex indices are assigned contiguously in \a vertices(g) + * order. */ template void createUnGraph(const GraphT &g, - bool excludeStarts, - bool excludeAccepts, - NFAUndirectedGraph &ug, - ue2::unordered_map &old2new, - ue2::unordered_map &newIdx2old) { - u32 idx = 0; + bool excludeStarts, + bool excludeAccepts, + NFAUndirectedGraph &ug, + ue2::unordered_map &old2new) { + size_t idx = 0; + typedef typename GraphT::vertex_descriptor VertexT; for (auto v : ue2::vertices_range(g)) { // skip all accept nodes @@ -84,13 +90,12 @@ void createUnGraph(const GraphT &g, NFAUndirectedVertex nuv = boost::add_vertex(ug); old2new[v] = nuv; - newIdx2old[idx] = v; boost::put(boost::vertex_index, ug, nuv, idx++); } for (const auto &e : ue2::edges_range(g)) { - NFAVertex src = source(e, g); - NFAVertex targ = target(e, g); + VertexT src = source(e, g); + VertexT targ = target(e, g); if ((excludeAccepts && is_any_accept(src, g)) || (excludeStarts && is_any_start(src, g))) { diff --git a/src/nfagraph/ng_utf8.cpp b/src/nfagraph/ng_utf8.cpp index 719e42e2..383aa142 100644 --- a/src/nfagraph/ng_utf8.cpp +++ b/src/nfagraph/ng_utf8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -176,7 +176,7 @@ void findSeeds(const NGHolder &h, const bool som, vector *seeds) { continue; } - DEBUG_PRINTF("%u is a seed\n", h[v].index); + DEBUG_PRINTF("%zu is a seed\n", h[v].index); seeds->push_back(v); already_seeds.insert(v); } @@ -184,13 +184,12 @@ void findSeeds(const NGHolder &h, const bool som, vector *seeds) { static bool expandCyclic(NGHolder &h, NFAVertex v) { - DEBUG_PRINTF("inspecting %u\n", h[v].index); + DEBUG_PRINTF("inspecting %zu\n", h[v].index); bool changes = false; - set v_preds; - set v_succs; - pred(h, v, &v_preds); - succ(h, v, &v_succs); + auto v_preds = preds(v, h); + auto v_succs = succs(v, h); + set start_siblings; set end_siblings; @@ -199,11 +198,10 @@ bool expandCyclic(NGHolder &h, NFAVertex v) { /* We need to find start vertices which have all of our preds. * As we have a self loop, it must be one of our succs. 
*/ for (auto a : adjacent_vertices_range(v, h)) { - set a_preds; - pred(h, a, &a_preds); + auto a_preds = preds(a, h); if (a_preds == v_preds && isutf8start(h[a].char_reach)) { - DEBUG_PRINTF("%u is a start v\n", h[a].index); + DEBUG_PRINTF("%zu is a start v\n", h[a].index); start_siblings.insert(a); } } @@ -211,11 +209,10 @@ bool expandCyclic(NGHolder &h, NFAVertex v) { /* We also need to find full cont vertices which have all our own succs; * As we have a self loop, it must be one of our preds. */ for (auto a : inv_adjacent_vertices_range(v, h)) { - set a_succs; - succ(h, a, &a_succs); + auto a_succs = succs(a, h); if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) { - DEBUG_PRINTF("%u is a full tail cont\n", h[a].index); + DEBUG_PRINTF("%zu is a full tail cont\n", h[a].index); end_siblings.insert(a); } } @@ -229,7 +226,7 @@ bool expandCyclic(NGHolder &h, NFAVertex v) { if (cr.isSubsetOf(UTF_TWO_START_CR)) { if (end_siblings.find(*adjacent_vertices(s, h).first) == end_siblings.end()) { - DEBUG_PRINTF("%u is odd\n", h[s].index); + DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_THREE_START_CR)) { @@ -241,7 +238,7 @@ bool expandCyclic(NGHolder &h, NFAVertex v) { } if (end_siblings.find(*adjacent_vertices(m, h).first) == end_siblings.end()) { - DEBUG_PRINTF("%u is odd\n", h[s].index); + DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) { @@ -261,11 +258,11 @@ bool expandCyclic(NGHolder &h, NFAVertex v) { if (end_siblings.find(*adjacent_vertices(m2, h).first) == end_siblings.end()) { - DEBUG_PRINTF("%u is odd\n", h[s].index); + DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else { - DEBUG_PRINTF("%u is bad\n", h[s].index); + DEBUG_PRINTF("%zu is bad\n", h[s].index); continue; } diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index c629d553..5252eb18 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -46,15 +46,13 @@ #include #include #include -#include #include #include using namespace std; using boost::default_color_type; -using boost::filtered_graph; +using boost::make_filtered_graph; using boost::make_assoc_property_map; -using boost::adaptors::map_values; namespace ue2 { @@ -146,7 +144,7 @@ void clone_out_edges(NGHolder &g, NFAVertex source, NFAVertex dest) { if (edge(dest, t, g).second) { continue; } - NFAEdge clone = add_edge(dest, t, g).first; + NFAEdge clone = add_edge(dest, t, g); u32 idx = g[clone].index; g[clone] = g[e]; g[clone].index = idx; @@ -157,7 +155,7 @@ void clone_in_edges(NGHolder &g, NFAVertex s, NFAVertex dest) { for (const auto &e : in_edges_range(s, g)) { NFAVertex ss = source(e, g); assert(!edge(ss, dest, g).second); - NFAEdge clone = add_edge(ss, dest, g).first; + NFAEdge clone = add_edge(ss, dest, g); u32 idx = g[clone].index; g[clone] = g[e]; g[clone].index = idx; @@ -165,27 +163,21 @@ void clone_in_edges(NGHolder &g, NFAVertex s, NFAVertex dest) { } bool onlyOneTop(const NGHolder &g) { - set tops; - for (const auto &e : out_edges_range(g.start, g)) { - tops.insert(g[e].top); - } - assert(!tops.empty()); - return tops.size() == 1; + return getTops(g).size() == 1; } namespace { struct CycleFound {}; struct DetectCycles : public boost::default_dfs_visitor { explicit DetectCycles(const NGHolder &g) : startDs(g.startDs) {} - void back_edge(const NFAEdge &e, const NFAGraph &g) const { + void back_edge(const NFAEdge &e, const NGHolder &g) const { NFAVertex u = source(e, g), v = target(e, g); // We ignore the startDs self-loop. 
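`DetectCycles` above is the stock BGL recipe for cycle detection: run a depth-first search, and any back edge other than the deliberately ignored `startDs` self-loop proves a cycle. A stripped-down, self-contained version (generic graph type, hypothetical names):

```cpp
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/depth_first_search.hpp>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::directedS>;
using Edge = boost::graph_traits<Graph>::edge_descriptor;

struct CycleFound {};

// Throw on the first back edge seen; a DFS that completes without one
// proves the graph acyclic.
struct ThrowOnBackEdge : boost::default_dfs_visitor {
    void back_edge(Edge, const Graph &) const { throw CycleFound(); }
};

bool isAcyclicGraph(const Graph &g) {
    try {
        boost::depth_first_search(g, boost::visitor(ThrowOnBackEdge()));
    } catch (const CycleFound &) {
        return false;
    }
    return true;
}
```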
if (u == startDs && v == startDs) { return; } // Any other back-edge indicates a cycle. - DEBUG_PRINTF("back edge %u->%u found\n", g[u].index, - g[v].index); + DEBUG_PRINTF("back edge %zu->%zu found\n", g[u].index, g[v].index); throw CycleFound(); } private: @@ -220,10 +212,8 @@ bool isFloating(const NGHolder &g) { bool isAcyclic(const NGHolder &g) { try { - depth_first_search( - g.g, visitor(DetectCycles(g)) - .root_vertex(g.start) - .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); + boost::depth_first_search(g, visitor(DetectCycles(g)) + .root_vertex(g.start)); } catch (const CycleFound &) { return false; } @@ -239,11 +229,11 @@ bool hasReachableCycle(const NGHolder &g, NFAVertex src) { try { // Use depth_first_visit, rather than depth_first_search, so that we // only search from src. - auto index_map = get(&NFAGraphVertexProps::index, g.g); - depth_first_visit( - g.g, src, DetectCycles(g), - make_iterator_property_map(colors.begin(), index_map)); - } catch (const CycleFound&) { + auto index_map = get(vertex_index, g); + boost::depth_first_visit(g, src, DetectCycles(g), + make_iterator_property_map(colors.begin(), + index_map)); + } catch (const CycleFound &) { return true; } @@ -254,10 +244,7 @@ bool hasBigCycles(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); set dead; BackEdges> backEdgeVisitor(dead); - depth_first_search( - g.g, visitor(backEdgeVisitor) - .root_vertex(g.start) - .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); + boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start)); for (const auto &e : dead) { if (source(e, g) != target(e, g)) { @@ -268,43 +255,9 @@ bool hasBigCycles(const NGHolder &g) { return false; } -set findVerticesInCycles(const NGHolder &g) { - map comp_map; - - strong_components(g.g, make_assoc_property_map(comp_map), - vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); - - map > comps; - - for (const auto &e : comp_map) { - comps[e.second].insert(e.first); - } - - - set rv; - - for (const auto &comp : comps | map_values) { - /* every vertex in a strongly connected component is reachable from - * every other vertex in the component. A vertex is involved in a cycle - * therefore if it is in a strongly connected component with more than - * one vertex or if it is the only vertex and it has a self loop. 
*/ - assert(!comp.empty()); - if (comp.size() > 1) { - insert(&rv, comp); - } - NFAVertex v = *comp.begin(); - if (hasSelfLoop(v, g)) { - rv.insert(v); - } - } - - return rv; -} - bool can_never_match(const NGHolder &g) { assert(edge(g.accept, g.acceptEod, g).second); - if (!hasGreaterInDegree(0, g.accept, g) - && !hasGreaterInDegree(1, g.acceptEod, g)) { + if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) { DEBUG_PRINTF("no paths into accept\n"); return true; } @@ -313,7 +266,7 @@ bool can_never_match(const NGHolder &g) { } bool can_match_at_eod(const NGHolder &h) { - if (hasGreaterInDegree(1, h.acceptEod, h)) { + if (in_degree(h.acceptEod, h) > 1) { DEBUG_PRINTF("more than one edge to acceptEod\n"); return true; } @@ -337,17 +290,56 @@ bool can_only_match_at_eod(const NGHolder &g) { } bool matches_everywhere(const NGHolder &h) { - NFAEdge e; - bool exists; - tie(e, exists) = edge(h.startDs, h.accept, h); + NFAEdge e = edge(h.startDs, h.accept, h); - return exists && !h[e].assert_flags; + return e && !h[e].assert_flags; } bool is_virtual_start(NFAVertex v, const NGHolder &g) { return g[v].assert_flags & POS_FLAG_VIRTUAL_START; } +static +void reorderSpecials(const NGHolder &g, vector &topoOrder) { + // Start is last element of reverse topo ordering. + auto it = find(topoOrder.begin(), topoOrder.end(), g.start); + if (it != topoOrder.end() - 1) { + DEBUG_PRINTF("repositioning start\n"); + assert(it != topoOrder.end()); + topoOrder.erase(it); + topoOrder.insert(topoOrder.end(), g.start); + } + + // StartDs is second-to-last element of reverse topo ordering. + it = find(topoOrder.begin(), topoOrder.end(), g.startDs); + if (it != topoOrder.end() - 2) { + DEBUG_PRINTF("repositioning start ds\n"); + assert(it != topoOrder.end()); + topoOrder.erase(it); + topoOrder.insert(topoOrder.end() - 1, g.startDs); + } + + // AcceptEOD is first element of reverse topo ordering. + it = find(topoOrder.begin(), topoOrder.end(), g.acceptEod); + if (it != topoOrder.begin()) { + DEBUG_PRINTF("repositioning accept\n"); + assert(it != topoOrder.end()); + topoOrder.erase(it); + topoOrder.insert(topoOrder.begin(), g.acceptEod); + } + + // Accept is second element of reverse topo ordering, if it's connected. 
+ it = find(topoOrder.begin(), topoOrder.end(), g.accept); + if (it != topoOrder.begin() + 1) { + DEBUG_PRINTF("repositioning accept\n"); + assert(it != topoOrder.end()); + topoOrder.erase(it); + if (in_degree(g.accept, g) != 0) { + topoOrder.insert(topoOrder.begin() + 1, g.accept); + } + } +} + vector getTopoOrdering(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); @@ -360,22 +352,19 @@ vector getTopoOrdering(const NGHolder &g) { EdgeSet backEdges; BackEdges be(backEdges); - auto index_map = get(&NFAGraphVertexProps::index, g.g); - depth_first_search(g.g, visitor(be) - .root_vertex(g.start) - .color_map(make_iterator_property_map( - colour.begin(), index_map)) - .vertex_index_map(index_map)); + auto index_map = get(vertex_index, g); + depth_first_search(g, visitor(be).root_vertex(g.start) + .color_map(make_iterator_property_map( + colour.begin(), index_map))); - AcyclicFilter af(&be.backEdges); - filtered_graph> acyclic_g(g.g, af); + auto acyclic_g = make_filtered_graph(g, make_bad_edge_filter(&backEdges)); vector ordering; ordering.reserve(num_verts); - topological_sort( - acyclic_g, back_inserter(ordering), - color_map(make_iterator_property_map(colour.begin(), index_map)) - .vertex_index_map(index_map)); + topological_sort(acyclic_g, back_inserter(ordering), + color_map(make_iterator_property_map(colour.begin(), index_map))); + + reorderSpecials(g, ordering); return ordering; } @@ -397,14 +386,12 @@ void mustBeSetBefore_int(NFAVertex u, const NGHolder &g, } } - // The AcyclicFilter is badly named, it's really just an edge-set filter. - filtered_graph>> prefix(g.g, - AcyclicFilter>(&dead)); + auto prefix = make_filtered_graph(g, make_bad_edge_filter(&dead)); depth_first_visit( prefix, g.start, make_dfs_visitor(boost::null_visitor()), make_iterator_property_map(vertexColor.begin(), - get(&NFAGraphVertexProps::index, g.g))); + get(vertex_index, g))); } bool mustBeSetBefore(NFAVertex u, NFAVertex v, const NGHolder &g, @@ -421,15 +408,14 @@ bool mustBeSetBefore(NFAVertex u, NFAVertex v, const NGHolder &g, mustBeSetBefore_int(u, g, vertexColor); for (auto vi : vertices_range(g)) { - auto key2 = make_pair(g[u].index, - g[vi].index); - DEBUG_PRINTF("adding %u %u\n", key2.first, key2.second); + auto key2 = make_pair(g[u].index, g[vi].index); + DEBUG_PRINTF("adding %zu %zu\n", key2.first, key2.second); assert(!contains(cache.cache, key2)); bool value = vertexColor[g[vi].index] == boost::white_color; cache.cache[key2] = value; assert(contains(cache.cache, key2)); } - DEBUG_PRINTF("cache miss %u %u (%zu)\n", key.first, key.second, + DEBUG_PRINTF("cache miss %zu %zu (%zu)\n", key.first, key.second, cache.cache.size()); return cache.cache[key]; } @@ -465,17 +451,21 @@ void appendLiteral(NGHolder &h, const ue2_literal &s) { ue2::flat_set getTops(const NGHolder &h) { ue2::flat_set tops; for (const auto &e : out_edges_range(h.start, h)) { - NFAVertex v = target(e, h); - if (v == h.startDs) { - continue; - } - u32 top = h[e].top; - assert(top < NFA_MAX_TOP_MASKS); - tops.insert(top); + insert(&tops, h[e].tops); } return tops; } +void setTops(NGHolder &h, u32 top) { + for (const auto &e : out_edges_range(h.start, h)) { + assert(h[e].tops.empty()); + if (target(e, h) == h.startDs) { + continue; + } + h[e].tops.insert(top); + } +} + void clearReports(NGHolder &g) { DEBUG_PRINTF("clearing reports without an accept edge\n"); ue2::unordered_set allow; @@ -553,12 +543,13 @@ void fillHolder(NGHolder *outp, const NGHolder &in, const deque &vv, fillHolderOutEdges(out, in, v_map, u); } - 
out.renumberEdges(); - out.renumberVertices(); + renumber_edges(out); + renumber_vertices(out); } void cloneHolder(NGHolder &out, const NGHolder &in) { assert(hasCorrectlyNumberedVertices(in)); + assert(hasCorrectlyNumberedVertices(out)); out.kind = in.kind; // Note: depending on the state of the input graph, some stylized edges @@ -568,6 +559,7 @@ void cloneHolder(NGHolder &out, const NGHolder &in) { /* remove the existing special edges */ clear_vertex(out.startDs, out); clear_vertex(out.accept, out); + renumber_edges(out); vector out_mapping(num_vertices(in)); out_mapping[NODE_START] = out.start; @@ -595,16 +587,13 @@ void cloneHolder(NGHolder &out, const NGHolder &in) { NFAVertex s = out_mapping[si]; NFAVertex t = out_mapping[ti]; - UNUSED bool added; - NFAEdge e2; - tie(e2, added) = add_edge(s, t, out); - assert(added); + NFAEdge e2 = add_edge(s, t, out); out[e2] = in[e]; } // Safety checks. - assert(num_vertices(in.g) == num_vertices(out.g)); - assert(num_edges(in.g) == num_edges(out.g)); + assert(num_vertices(in) == num_vertices(out)); + assert(num_edges(in) == num_edges(out)); assert(hasCorrectlyNumberedVertices(out)); } @@ -630,14 +619,66 @@ unique_ptr cloneHolder(const NGHolder &in) { return h; } +void reverseHolder(const NGHolder &g_in, NGHolder &g) { + // Make the BGL do the grunt work. + ue2::unordered_map vertexMap; + boost::transpose_graph(g_in, g, + orig_to_copy(boost::make_assoc_property_map(vertexMap))); + + // The transpose_graph operation will have created extra copies of our + // specials. We have to rewire their neighbours to the 'real' specials and + // delete them. + NFAVertex start = vertexMap[g_in.acceptEod]; + NFAVertex startDs = vertexMap[g_in.accept]; + NFAVertex accept = vertexMap[g_in.startDs]; + NFAVertex acceptEod = vertexMap[g_in.start]; + + // Successors of starts. + for (const auto &e : out_edges_range(start, g)) { + NFAVertex v = target(e, g); + add_edge(g.start, v, g[e], g); + } + for (const auto &e : out_edges_range(startDs, g)) { + NFAVertex v = target(e, g); + add_edge(g.startDs, v, g[e], g); + } + + // Predecessors of accepts. + for (const auto &e : in_edges_range(accept, g)) { + NFAVertex u = source(e, g); + add_edge(u, g.accept, g[e], g); + } + for (const auto &e : in_edges_range(acceptEod, g)) { + NFAVertex u = source(e, g); + add_edge(u, g.acceptEod, g[e], g); + } + + // Remove our impostors. + clear_vertex(start, g); + remove_vertex(start, g); + clear_vertex(startDs, g); + remove_vertex(startDs, g); + clear_vertex(accept, g); + remove_vertex(accept, g); + clear_vertex(acceptEod, g); + remove_vertex(acceptEod, g); + + // Renumber so that g's properties (number of vertices, edges) are + // accurate. 
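`reverseHolder()` above lets `boost::transpose_graph` do the heavy lifting: it copies every vertex and inserts each edge reversed (the real call also passes `orig_to_copy()` so the duplicated specials can be rewired afterwards). The core operation in minimal generic form:

```cpp
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/transpose_graph.hpp>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::bidirectionalS>;

// transpose_graph(g, rev) adds a vertex in rev for every vertex of g and
// an edge (v, u) for every edge (u, v); g itself is left untouched.
Graph reverseOf(const Graph &g) {
    Graph rev;
    boost::transpose_graph(g, rev);
    return rev;
}
```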
+    renumber_vertices(g);
+    renumber_edges(g);
+
+    assert(num_vertices(g) == num_vertices(g_in));
+    assert(num_edges(g) == num_edges(g_in));
+}
+
 #ifndef NDEBUG
 
 bool allMatchStatesHaveReports(const NGHolder &g) {
     unordered_set<NFAVertex> reporters;
     for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
         if (g[v].reports.empty()) {
-            DEBUG_PRINTF("vertex %u has no reports!\n",
-                         g[v].index);
+            DEBUG_PRINTF("vertex %zu has no reports!\n", g[v].index);
             return false;
         }
         reporters.insert(v);
@@ -648,8 +689,7 @@ bool allMatchStatesHaveReports(const NGHolder &g) {
             continue; // stylised edge
         }
         if (g[v].reports.empty()) {
-            DEBUG_PRINTF("vertex %u has no reports!\n",
-                         g[v].index);
+            DEBUG_PRINTF("vertex %zu has no reports!\n", g[v].index);
             return false;
         }
         reporters.insert(v);
@@ -657,7 +697,7 @@ bool allMatchStatesHaveReports(const NGHolder &g) {
 
     for (auto v : vertices_range(g)) {
         if (!contains(reporters, v) && !g[v].reports.empty()) {
-            DEBUG_PRINTF("vertex %u is not a match state, but has reports!\n",
+            DEBUG_PRINTF("vertex %zu is not a match state, but has reports!\n",
                          g[v].index);
             return false;
         }
@@ -666,32 +706,22 @@ bool allMatchStatesHaveReports(const NGHolder &g) {
     return true;
 }
 
-bool hasCorrectlyNumberedVertices(const NGHolder &g) {
-    size_t count = num_vertices(g);
-    vector<bool> ids(count, false);
-    for (auto v : vertices_range(g)) {
-        u32 id = g[v].index;
-        if (id >= count || ids[id]) {
-            return false; // duplicate
+bool isCorrectlyTopped(const NGHolder &g) {
+    if (is_triggered(g)) {
+        for (const auto &e : out_edges_range(g.start, g)) {
+            if (g[e].tops.empty() != (target(e, g) == g.startDs)) {
+                return false;
+            }
+        }
+    } else {
+        for (const auto &e : out_edges_range(g.start, g)) {
+            if (!g[e].tops.empty()) {
+                return false;
+            }
         }
-        ids[id] = true;
     }
-    return find(ids.begin(), ids.end(), false) == ids.end()
-        && num_vertices(g) == num_vertices(g.g);
-}
 
-bool hasCorrectlyNumberedEdges(const NGHolder &g) {
-    size_t count = num_edges(g);
-    vector<bool> ids(count, false);
-    for (const auto &e : edges_range(g)) {
-        u32 id = g[e].index;
-        if (id >= count || ids[id]) {
-            return false; // duplicate
-        }
-        ids[id] = true;
-    }
-    return find(ids.begin(), ids.end(), false) == ids.end()
-        && num_edges(g) == num_edges(g.g);
+    return true;
 }
 
 #endif // NDEBUG
diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h
index 4f58dc45..a0752533 100644
--- a/src/nfagraph/ng_util.h
+++ b/src/nfagraph/ng_util.h
@@ -65,18 +65,30 @@ bool is_dot(NFAVertex v, const GraphT &g) {
 
 template<class U>
 static really_inline
 void succ(const NGHolder &g, NFAVertex v, U *s) {
-    NGHolder::adjacency_iterator ai, ae;
-    tie(ai, ae) = adjacent_vertices(v, g);
-    s->insert(ai, ae);
+    auto rv = adjacent_vertices(v, g);
+    s->insert(rv.first, rv.second);
+}
+
+template<class ContTemp = flat_set<NFAVertex>>
+ContTemp succs(NFAVertex u, const NGHolder &g) {
+    ContTemp rv;
+    succ(g, u, &rv);
+    return rv;
 }
 
 /** adds predecessors of v to s */
 template<class U>
 static really_inline
 void pred(const NGHolder &g, NFAVertex v, U *p) {
-    NGHolder::inv_adjacency_iterator it, ite;
-    tie(it, ite) = inv_adjacent_vertices(v, g);
-    p->insert(it, ite);
+    auto rv = inv_adjacent_vertices(v, g);
+    p->insert(rv.first, rv.second);
+}
+
+template<class ContTemp = flat_set<NFAVertex>>
+ContTemp preds(NFAVertex u, const NGHolder &g) {
+    ContTemp rv;
+    pred(g, u, &rv);
+    return rv;
 }
 
 /** returns a vertex with an out edge from v and is not v.
@@ -88,6 +100,30 @@
 NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex v);
 
 /** Like getSoleDestVertex but for in-edges */
 NFAVertex getSoleSourceVertex(const NGHolder &g, NFAVertex v);
 
+/** \brief edge filtered graph.
+ *
+ * This gives you a view over the graph that excludes the edges in the
+ * provided set.
+ *
+ * If the set contains the graph's back edges, the result is an acyclic
+ * subgraph view. This is useful for topological_sort and other algorithms
+ * that require a DAG.
+ */
+template<typename EdgeSet>
+struct bad_edge_filter {
+    bad_edge_filter() {}
+    explicit bad_edge_filter(const EdgeSet *bad_e) : bad_edges(bad_e) {}
+    bool operator()(const typename EdgeSet::value_type &e) const {
+        return !contains(*bad_edges, e); /* keep edges not in the bad set */
+    }
+    const EdgeSet *bad_edges = nullptr;
+};
+
+template<typename EdgeSet>
+bad_edge_filter<EdgeSet> make_bad_edge_filter(const EdgeSet *e) {
+    return bad_edge_filter<EdgeSet>(e);
+}
+
 /** Visitor that records back edges */
 template<class BackEdgeSet>
 class BackEdges : public boost::default_dfs_visitor {
@@ -100,59 +136,11 @@ public:
     BackEdgeSet &backEdges;
 };
 
-/** \brief Acyclic filtered graph.
- *
- * This will give you a view over the graph that is directed and acyclic:
- * useful for topological_sort and other algorithms that require a DAG.
- */
-template<class BackEdgeSet>
-struct AcyclicFilter {
-    AcyclicFilter() {}
-    explicit AcyclicFilter(const BackEdgeSet *edges) : backEdges(edges) {}
-    template<class EdgeT>
-    bool operator()(const EdgeT &e) const {
-        // Only keep edges that aren't in the back edge set.
-        return (backEdges->find(e) == backEdges->end());
-    }
-    const BackEdgeSet *backEdges = nullptr;
-};
-
-/**
- * Generic code to renumber all the vertices in a graph. Assumes that we're
- * using a vertex_index property of type u32, and that we always have
- * N_SPECIALS special vertices already present (which we don't want to
- * renumber).
- */
-template<class GraphT>
-static really_inline
-size_t renumberGraphVertices(GraphT &g) {
-    size_t num = N_SPECIALS;
-    for (const auto &v : vertices_range(g)) {
-        if (!is_special(v, g)) {
-            g[v].index = num++;
-            assert(num > 0); // no wrapping
-        }
-    }
-    return num;
-}
-
-/** Renumber all the edges in a graph. */
-template<class GraphT>
-static really_inline
-size_t renumberGraphEdges(GraphT &g) {
-    size_t num = 0;
-    for (const auto &e : edges_range(g)) {
-        g[e].index = num++;
-        assert(num > 0); // no wrapping
-    }
-    return num;
-}
-
 /** Returns true if the vertex is either of the real starts (NODE_START,
  * NODE_START_DOTSTAR). */
 template<class GraphT>
 static really_inline
-bool is_any_start(const NFAVertex v, const GraphT &g) {
+bool is_any_start(typename GraphT::vertex_descriptor v, const GraphT &g) {
     u32 i = g[v].index;
     return i == NODE_START || i == NODE_START_DOTSTAR;
 }
@@ -160,47 +148,34 @@ bool is_virtual_start(NFAVertex v, const NGHolder &g);
 
 template<class GraphT>
-static really_inline
-bool is_any_accept(const NFAVertex v, const GraphT &g) {
+bool is_any_accept(typename GraphT::vertex_descriptor v, const GraphT &g) {
     u32 i = g[v].index;
     return i == NODE_ACCEPT || i == NODE_ACCEPT_EOD;
 }
 
 /** returns true iff v has an edge to accept or acceptEod */
 template<class GraphT>
-static really_inline
-bool is_match_vertex(NFAVertex v, const GraphT &g) {
+bool is_match_vertex(typename GraphT::vertex_descriptor v, const GraphT &g) {
     return edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second;
 }
 
 /** Generate a reverse topological ordering for a back-edge filtered version of
- * our graph (as it must be a DAG and correctly numbered) */
+ * our graph (as it must be a DAG and correctly numbered).
+ *
+ * Note: we ensure that we produce a topo ordering that begins with acceptEod
+ * and accept (if present) and ends with startDs followed by start.
+ */
 std::vector<NFAVertex> getTopoOrdering(const NGHolder &g);
 
-/** Comparison functor used to sort by vertex_index. */
-template<typename Graph>
-struct VertexIndexOrdering {
-    VertexIndexOrdering(const Graph &g_in) : g(&g_in) {}
-    bool operator()(typename Graph::vertex_descriptor a,
-                    typename Graph::vertex_descriptor b) const {
-        assert(a == b || (*g)[a].index != (*g)[b].index);
-        return (*g)[a].index < (*g)[b].index;
-    }
-private:
-    const Graph *g;
-};
-
-template<typename Graph>
-static
-VertexIndexOrdering<Graph> make_index_ordering(const Graph &g) {
-    return VertexIndexOrdering<Graph>(g);
-}
-
 bool onlyOneTop(const NGHolder &g);
 
-/** Return a mask of the tops on the given graph. */
+/** Return the set of the tops on the given graph. */
 flat_set<u32> getTops(const NGHolder &h);
 
+/** Initialise the tops on h to the provided top. Assumes that h is triggered
+ * and that no tops have been set on h yet. */
+void setTops(NGHolder &h, u32 top = DEFAULT_TOP);
+
 /** adds a vertex to g with all the same vertex properties as \p v (aside from
  * index) */
 NFAVertex clone_vertex(NGHolder &g, NFAVertex v);
@@ -296,6 +271,10 @@ void clearReports(NGHolder &g);
  * r_old. */
 void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new);
 
+/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
+ * accepts. */
+void reverseHolder(const NGHolder &g, NGHolder &out);
+
 #ifndef NDEBUG
 // Assertions: only available in internal builds.
@@ -308,17 +287,11 @@ void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new);
 bool allMatchStatesHaveReports(const NGHolder &g);
 
 /**
- * Assertion: returns true if the vertices in this graph are contiguously (and
- * uniquely) numbered from zero.
+ * Assertion: returns true if the graph is triggered and all edges out of start
+ * have tops, OR if the graph is not triggered and all edges out of start have
+ * no tops.
  */
-bool hasCorrectlyNumberedVertices(const NGHolder &g);
-
-/**
- * Assertion: returns true if the edges in this graph are contiguously (and
- * uniquely) numbered from zero.
- */
-bool hasCorrectlyNumberedEdges(const NGHolder &g);
-
+bool isCorrectlyTopped(const NGHolder &g);
 #endif // NDEBUG
 
 } // namespace ue2
diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp
index 94e0a998..985246f0 100644
--- a/src/nfagraph/ng_violet.cpp
+++ b/src/nfagraph/ng_violet.cpp
@@ -67,8 +67,6 @@
 #include
 #include
 #include
-#include
-#include
 #include
 
 #define STAGE_DEBUG_PRINTF DEBUG_PRINTF
@@ -466,7 +464,7 @@ void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
         DEBUG_PRINTF("inspecting region %u\n", region);
         set<ue2_literal> s;
         for (auto v : vv) {
-            DEBUG_PRINTF("   exit vertex: %u\n", g[v].index);
+            DEBUG_PRINTF("   exit vertex: %zu\n", g[v].index);
             /* Note: RHS can not be depended on to take all subsequent revisits
              * to this vertex */
             set<ue2_literal> ss = getLiteralSet(g, v, false);
@@ -671,7 +669,7 @@ unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
         lits.pop_back();
     }
 
-    DEBUG_PRINTF("best is '%s' %u a%d t%d\n",
+    DEBUG_PRINTF("best is '%s' %zu a%d t%d\n",
                  dumpString(*best->lit.begin()).c_str(),
                  g[best->vv.front()].index, depths ?
                     (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
@@ -779,7 +777,7 @@ set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg,
     set<NFAVertex> bad_vertices;
     for (const NFAEdge &e : bad_edges) {
         bad_vertices.insert(target(e, h));
-        DEBUG_PRINTF("bad: %u->%u\n", h[source(e, h)].index,
+        DEBUG_PRINTF("bad: %zu->%zu\n", h[source(e, h)].index,
                      h[target(e, h)].index);
     }
 
@@ -1076,8 +1074,10 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
 
     assert(hasCorrectlyNumberedVertices(*rhs));
     assert(hasCorrectlyNumberedEdges(*rhs));
+    assert(isCorrectlyTopped(*rhs));
     assert(hasCorrectlyNumberedVertices(*lhs));
     assert(hasCorrectlyNumberedEdges(*lhs));
+    assert(isCorrectlyTopped(*lhs));
 
     return true;
 }
@@ -1144,7 +1144,7 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
         NFAVertex prev_v = source(e, h);
         NFAVertex pivot = target(e, h);
 
-        DEBUG_PRINTF("splitting on pivot %u\n", h[pivot].index);
+        DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index);
         ue2::unordered_map<NFAVertex, NFAVertex> temp_map;
         shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>();
         splitLHS(h, pivot, new_lhs.get(), &temp_map);
@@ -1152,7 +1152,11 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
         /* want to cut off paths to pivot from things other than the pivot -
          * makes a more svelte graph */
         clear_in_edges(temp_map[pivot], *new_lhs);
-        add_edge(temp_map[prev_v], temp_map[pivot], *new_lhs);
+        NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot],
+                                      *new_lhs);
+        if (is_triggered(h) && prev_v == h.start) {
+            (*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP);
+        }
 
         pruneUseless(*new_lhs, false);
         renumber_vertices(*new_lhs);
@@ -1162,6 +1166,7 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
 
         assert(hasCorrectlyNumberedVertices(*new_lhs));
         assert(hasCorrectlyNumberedEdges(*new_lhs));
+        assert(isCorrectlyTopped(*new_lhs));
 
         const set<ue2_literal> &lits = cut_lits.at(e);
         for (const auto &lit : lits) {
@@ -1228,6 +1233,7 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
                 DEBUG_PRINTF("  into rhs %s\n",
                              to_string(new_rhs->kind).c_str());
                 done_rhs.emplace(adj, new_rhs);
+                assert(isCorrectlyTopped(*new_rhs));
             }
 
             assert(done_rhs[adj].get());
@@ -1235,6 +1241,7 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
 
             assert(hasCorrectlyNumberedVertices(*new_rhs));
             assert(hasCorrectlyNumberedEdges(*new_rhs));
+            assert(isCorrectlyTopped(*new_rhs));
 
             if (vg[dest].type == RIV_LITERAL
                 && !can_match(*new_rhs, vg[dest].s, true)) {
@@ -1317,7 +1324,7 @@ bool deanchorIfNeeded(NGHolder &g) {
     succ_g.erase(g.startDs);
 
     for (auto v : adjacent_vertices_range(g.start, g)) {
-        DEBUG_PRINTF("inspecting cand %u || = %zu\n", g[v].index,
+        DEBUG_PRINTF("inspecting cand %zu || = %zu\n", g[v].index,
                      g[v].char_reach.count());
 
         if (v == g.startDs || !g[v].char_reach.all()) {
@@ -1380,6 +1387,7 @@ void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) {
     RoseInEdge e = *edges(vg).first;
 
     NGHolder &h = *vg[e].graph;
+    assert(isCorrectlyTopped(h));
     renumber_vertices(h);
     renumber_edges(h);
 
@@ -1602,6 +1610,7 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig,
             continue;
         }
 
+        assert(isCorrectlyTopped(*h_new));
         graphs[right] = make_pair(h_new, delay);
     }
 
@@ -1720,6 +1729,8 @@ unique_ptr<NGHolder> make_chain(u32 count) {
     h[u].reports.insert(0);
     add_edge(u, h.accept, h);
 
+    setTops(h);
+
     return rv;
 }
 
@@ -1777,6 +1788,7 @@ bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
 
         assert(willBeTransient(findMaxWidth(*h_new), cc)
                || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey));
+        assert(isCorrectlyTopped(*h_new));
         graphs[v] = h_new;
     }
 
@@ -1811,6 +1823,7 @@ bool
improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, const CompileContext &cc) { DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h, num_vertices(h)); + assert(isCorrectlyTopped(h)); renumber_vertices(h); renumber_edges(h); @@ -1860,6 +1873,7 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, for (const auto &e : ee) { shared_ptr hh = cloneHolder(h); auto succ_lit = vg[target(e, vg)].s; + assert(isCorrectlyTopped(*hh)); u32 delay = removeTrailingLiteralStates(*hh, succ_lit, succ_lit.length(), false /* can't overhang start */); @@ -1868,6 +1882,7 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, continue; } + assert(isCorrectlyTopped(*hh)); trimmed[hh].emplace_back(e, delay); } @@ -2110,10 +2125,15 @@ void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, add_edge(lhs->accept, lhs->acceptEod, *lhs); clearReports(*lhs); for (NFAVertex v : splitters) { - add_edge(v_map[v], lhs->accept, *lhs); + NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs); + if (v == base_graph.start) { + (*lhs)[e].tops.insert(DEFAULT_TOP); + } (*lhs)[v_map[v]].reports.insert(0); + } pruneUseless(*lhs); + assert(isCorrectlyTopped(*lhs)); /* create literal vertices and connect preds */ for (const auto &lit : split.lit) { @@ -2319,7 +2339,7 @@ bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) { make_nocase(&lit); } - DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str()); out->vv = {v}; out->lit = {lit}; return true; @@ -2448,7 +2468,7 @@ bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) { } ue2_literal lit = reverse_literal(rv.second); - DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str()); if (bad_mixed_sensitivity(lit)) { make_nocase(&lit); @@ -2652,6 +2672,7 @@ bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, pruneUseless(vg); dumpPreRoseGraph(vg, cc.grey); + renumber_vertices(vg); calcVertexOffsets(vg); bool rv = rose.addRose(vg, prefilter); DEBUG_PRINTF("violet: %s\n", rv ? 
"success" : "fail"); diff --git a/src/nfagraph/ng_width.cpp b/src/nfagraph/ng_width.cpp index 470f9343..d596b7b5 100644 --- a/src/nfagraph/ng_width.cpp +++ b/src/nfagraph/ng_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,18 +58,18 @@ namespace { struct SpecialEdgeFilter { SpecialEdgeFilter() {} explicit SpecialEdgeFilter(const NGHolder &h_in) : h(&h_in) {} - explicit SpecialEdgeFilter(const NGHolder &h_in, u32 top_in) + SpecialEdgeFilter(const NGHolder &h_in, u32 top_in) : h(&h_in), single_top(true), top(top_in) {} bool operator()(const NFAEdge &e) const { - const NFAGraph &g = h->g; - NFAVertex u = source(e, g), v = target(e, g); - if ((is_any_start(u, g) && is_any_start(v, g)) || - (is_any_accept(u, g) && is_any_accept(v, g))) { + NFAVertex u = source(e, *h); + NFAVertex v = target(e, *h); + if ((is_any_start(u, *h) && is_any_start(v, *h)) || + (is_any_accept(u, *h) && is_any_accept(v, *h))) { return false; } if (single_top) { - if (u == h->start && g[e].top != top) { + if (u == h->start && !contains((*h)[e].tops, top)) { return false; } if (u == h->startDs) { @@ -94,7 +94,7 @@ depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter, return depth::unreachable(); } - boost::filtered_graph g(h.g, filter); + boost::filtered_graph g(h, filter); assert(hasCorrectlyNumberedVertices(h)); const size_t num = num_vertices(h); @@ -106,11 +106,10 @@ depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter, // Since we are interested in the single-source shortest paths on a graph // with the same weight on every edge, using BFS will be faster than // Dijkstra here. - breadth_first_search( - g, src, + breadth_first_search(g, src, visitor(make_bfs_visitor(record_distances( make_iterator_property_map(distance.begin(), index_map), - boost::on_tree_edge()))).vertex_index_map(index_map)); + boost::on_tree_edge())))); DEBUG_PRINTF("d[accept]=%s, d[acceptEod]=%s\n", distance.at(NODE_ACCEPT).str().c_str(), @@ -130,7 +129,7 @@ depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter, static depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, NFAVertex src) { - if (isLeafNode(src, h.g)) { + if (isLeafNode(src, h)) { return depth::unreachable(); } @@ -139,7 +138,7 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, return depth::infinity(); } - boost::filtered_graph g(h.g, filter); + boost::filtered_graph g(h, filter); assert(hasCorrectlyNumberedVertices(h)); const size_t num = num_vertices(h); @@ -149,11 +148,9 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, auto index_map = get(&NFAGraphVertexProps::index, g); // DAG shortest paths with negative edge weights. 
- dag_shortest_paths( - g, src, + dag_shortest_paths(g, src, distance_map(make_iterator_property_map(distance.begin(), index_map)) .weight_map(boost::make_constant_property(-1)) - .vertex_index_map(index_map) .color_map(make_iterator_property_map(colors.begin(), index_map))); depth acceptDepth, acceptEodDepth; diff --git a/src/parser/prefilter.cpp b/src/parser/prefilter.cpp index ea58a134..f69362e4 100644 --- a/src/parser/prefilter.cpp +++ b/src/parser/prefilter.cpp @@ -295,6 +295,16 @@ public: Component *visit(ComponentWordBoundary *c) override { assert(c); + + // TODO: Right now, we do not have correct code for resolving these + // when prefiltering is on, UCP is on, and UTF-8 is *off*. For now, we + // just replace with an empty sequence (as that will return a superset + // of matches). + if (mode.ucp && !mode.utf8) { + return new ComponentSequence(); + } + + // All other cases can be prefiltered. c->setPrefilter(true); return c; } diff --git a/src/rose/catchup.c b/src/rose/catchup.c index 017a6bf0..82537241 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -401,7 +401,7 @@ hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, scratch->tctxt.mpv_inactive = 0; /* we know it is going to be an mpv, skip the indirection */ - next_pos_match_loc = nfaExecMpv0_QueueExecRaw(q->nfa, q, loc); + next_pos_match_loc = nfaExecMpv_QueueExecRaw(q->nfa, q, loc); assert(!q->report_current); if (!next_pos_match_loc) { /* 0 means dead */ @@ -441,7 +441,7 @@ char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { const struct RoseContext *tctxt = &scratch->tctxt; assert(tctxt->curr_qi < rose->queueCount); if (tctxt->curr_qi < rose->outfixBeginQueue) { - assert(getNfaByQueue(rose, tctxt->curr_qi)->type == MPV_NFA_0); + assert(getNfaByQueue(rose, tctxt->curr_qi)->type == MPV_NFA); return 1; } return 0; diff --git a/src/rose/init.c b/src/rose/init.c index 511eafe4..025ecca0 100644 --- a/src/rose/init.c +++ b/src/rose/init.c @@ -85,9 +85,4 @@ void roseInitState(const struct RoseEngine *t, char *state) { init_state(t, state); init_outfixes(t, state); - - // Clear the floating matcher state, if any. - DEBUG_PRINTF("clearing %u bytes of floating matcher state\n", - t->floatingStreamState); - memset(getFloatingMatcherState(t, state), 0, t->floatingStreamState); } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 8bf41715..5b2c829f 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -45,11 +45,13 @@ #include "rose_program.h" #include "rose_types.h" #include "validate_mask.h" +#include "validate_shufti.h" #include "runtime.h" #include "scratch.h" #include "ue2common.h" #include "hwlm/hwlm.h" // for hwlmcb_rv_t #include "util/compare.h" +#include "util/copybytes.h" #include "util/fatbit.h" #include "util/multibit.h" @@ -70,73 +72,6 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, /* Inline implementation follows. */ -static rose_inline -int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, - const u8 *and_mask, const u8 *exp_mask) { - const u8 *data; - - // If the check works over part of the history and part of the buffer, we - // create a temporary copy of the data in here so it's contiguous. 
-    u8 temp[MAX_MASK2_WIDTH];
-
-    s64a buffer_offset = (s64a)end - ci->buf_offset;
-    DEBUG_PRINTF("rel offset %lld\n", buffer_offset);
-    if (buffer_offset >= mask_rewind) {
-        data = ci->buf + buffer_offset - mask_rewind;
-        DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data,
-                     ci->buf, mask_rewind);
-    } else if (buffer_offset <= 0) {
-        data = ci->hbuf + ci->hlen + buffer_offset - mask_rewind;
-        DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data,
-                     ci->buf, mask_rewind);
-    } else {
-        u32 shortfall = mask_rewind - buffer_offset;
-        DEBUG_PRINTF("shortfall of %u, rewind %u hlen %zu\n", shortfall,
-                     mask_rewind, ci->hlen);
-        data = temp;
-        memcpy(temp, ci->hbuf + ci->hlen - shortfall, shortfall);
-        memcpy(temp + shortfall, ci->buf, mask_rewind - shortfall);
-    }
-
-#ifdef DEBUG
-    DEBUG_PRINTF("DATA: ");
-    for (u32 i = 0; i < mask_rewind; i++) {
-        printf("%c", ourisprint(data[i]) ? data[i] : '?');
-    }
-    printf(" (len=%u)\n", mask_rewind);
-#endif
-
-    u32 len = mask_rewind;
-    while (len >= sizeof(u64a)) {
-        u64a a = unaligned_load_u64a(data);
-        a &= *(const u64a *)and_mask;
-        if (a != *(const u64a *)exp_mask) {
-            DEBUG_PRINTF("argh %016llx %016llx\n", a, *(const u64a *)exp_mask);
-            return 0;
-        }
-        data += sizeof(u64a);
-        and_mask += sizeof(u64a);
-        exp_mask += sizeof(u64a);
-        len -= sizeof(u64a);
-    }
-
-    while (len) {
-        u8 a = *data;
-        a &= *and_mask;
-        if (a != *exp_mask) {
-            DEBUG_PRINTF("argh d%02hhx =%02hhx am%02hhx em%02hhx\n", a,
-                         *data, *and_mask, *exp_mask);
-            return 0;
-        }
-        data++;
-        and_mask++;
-        exp_mask++;
-        len--;
-    }
-
-    return 1;
-}
-
 static rose_inline
 void rosePushDelayedMatch(const struct RoseEngine *t,
                           struct hs_scratch *scratch, u32 delay,
@@ -783,6 +718,347 @@ int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask,
         return 0;
     }
 }
+
+static rose_inline
+int roseCheckMask32(const struct core_info *ci, const u8 *and_mask,
+                    const u8 *cmp_mask, const u32 neg_mask,
+                    s32 checkOffset, u64a end) {
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    s64a offset = base_offset + checkOffset;
+    DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
+    DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+
+    if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+        DEBUG_PRINTF("too early, fail\n");
+        return 0;
+    }
+
+    m256 data = zeroes256(); // consists of the following four parts.
+    s32 c_shift = 0; // blank bytes after current.
+    s32 h_shift = 0; // blank bytes before history.
+    s32 h_len = 32; // number of bytes from history buffer.
+    s32 c_len = 0; // number of bytes from current buffer.
+    /* h_shift + h_len + c_len + c_shift == 32 must hold. */
+
+    if (offset < 0) {
+        s32 h_offset = 0; // the start offset in history buffer.
+        if (offset < -(s64a)ci->hlen) {
+            if (offset + 32 <= -(s64a)ci->hlen) {
+                DEBUG_PRINTF("all before history\n");
+                return 1;
+            }
+            h_shift = -(offset + (s64a)ci->hlen);
+            h_len = 32 - h_shift;
+        } else {
+            h_offset = ci->hlen + offset;
+        }
+        if (offset + 32 > 0) {
+            // part in current buffer.
+            c_len = offset + 32;
+            h_len = -(offset + h_shift);
+            if (c_len > (s64a)ci->len) {
+                // out of current buffer.
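+                // Clamp the copy to the scanned buffer; the c_shift bytes
+                // beyond it stay zero and are excluded via valid_data_mask.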
+ c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_32_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 32); + copy_upto_32_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 32 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 32 - c_len; + copy_upto_32_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu256(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u32 valid_data_mask; + valid_data_mask = (~0u) << (h_shift + c_shift) >> (c_shift); + + m256 and_mask_m256 = loadu256(and_mask); + m256 cmp_mask_m256 = loadu256(cmp_mask); + if (validateMask32(data, valid_data_mask, and_mask_m256, + cmp_mask_m256, neg_mask)) { + DEBUG_PRINTF("Mask32 passed\n"); + return 1; + } + return 0; +} + +// get 128/256 bits data from history and current buffer. +// return data and valid_data_mask. +static rose_inline +u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, + u8 *data, const u32 data_len) { + assert(data_len == 16 || data_len == 32); + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = data_len; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + if (loc < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (loc < -(s64a)ci->hlen) { + if (loc + data_len <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 0; + } + h_shift = -(loc + (s64a)ci->hlen); + h_len = data_len - h_shift; + } else { + h_offset = ci->hlen + loc; + } + if (loc + data_len > 0) { + // part in current buffer. + c_len = loc + data_len; + h_len = -(loc + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. 
+ c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_32_bytes(data - loc, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == (s32)data_len); + copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (loc + data_len > (s64a)ci->len) { + if (loc >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 0; + } + c_len = ci->len - loc; + c_shift = data_len - c_len; + copy_upto_32_bytes(data, ci->buf + loc, c_len); + } else { + if (data_len == 16) { + storeu128(data, loadu128(ci->buf + loc)); + return 0xffff; + } else { + storeu256(data, loadu256(ci->buf + loc)); + return 0xffffffff; + } + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + + if (data_len == 16) { + return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; + } else { + return (~0u) << (h_shift + c_shift) >> c_shift; + } +} + +static rose_inline +m128 getData128(const struct core_info *ci, s64a offset, u16 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m128) <= ci->len) { + *valid_data_mask = 0xffff; + return loadu128(ci->buf + offset); + } + ALIGN_DIRECTIVE u8 data[sizeof(m128)]; + *valid_data_mask = (u16)getBufferDataComplex(ci, offset, data, 16); + return *(m128 *)data; +} + +static rose_inline +m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m256) <= ci->len) { + *valid_data_mask = ~0u; + return loadu256(ci->buf + offset); + } + ALIGN_AVX_DIRECTIVE u8 data[sizeof(m256)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 32); + return *(m256 *)data; +} + +static rose_inline +int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, + const u8 *bucket_select_mask, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u16 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 nib_mask_m256 = loadu256(nib_mask); + m128 bucket_select_mask_m128 = loadu128(bucket_select_mask); + if (validateShuftiMask16x8(data, nib_mask_m256, + bucket_select_mask_m128, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u16 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 data_m256 = set2x128(data); + m256 hi_mask_m256 = loadu256(hi_mask); + m256 lo_mask_m256 = loadu256(lo_mask); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask16x16(data_m256, hi_mask_m256, 
lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m128 hi_mask_m128 = loadu128(hi_mask); + m128 lo_mask_m128 = loadu128(lo_mask); + m256 hi_mask_m256 = set2x128(hi_mask_m128); + m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 hi_mask_1 = loadu2x128(hi_mask); + m256 hi_mask_2 = loadu2x128(hi_mask + 16); + m256 lo_mask_1 = loadu2x128(lo_mask); + m256 lo_mask_2 = loadu2x128(lo_mask + 16); + + m256 bucket_mask_hi = loadu256(bucket_select_mask_hi); + m256 bucket_mask_lo = loadu256(bucket_select_mask_lo); + if (validateShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, bucket_mask_hi, + bucket_mask_lo, neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckSingleLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + s8 checkOffset, u32 lookaroundIndex, u64a end) { + assert(lookaroundIndex != MO_INVALID_IDX); + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s64a base_offset = end - ci->buf_offset; + const s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const u8 *reach_base = (const u8 *)t + t->lookaroundReachOffset; + const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; + + u8 c; + if (offset >= 0 && offset < (s64a)ci->len) { + c = ci->buf[offset]; + } else if (offset < 0 && offset >= -(s64a)ci->hlen) { + c 
= ci->hbuf[ci->hlen + offset]; + } else { + return 1; + } + + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + /** * \brief Scan around a literal, checking that that "lookaround" reach masks * are satisfied. @@ -1055,6 +1331,78 @@ hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, return HWLM_CONTINUE_MATCHING; } +static rose_inline +int roseCheckLongLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. + if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the buffer prepared by + // the long literal table. This is only done in streaming mode. + + assert(t->mode != HS_MODE_BLOCK); + + const u8 *ll_buf; + size_t ll_len; + if (nocase) { + ll_buf = scratch->tctxt.ll_buf_nocase; + ll_len = scratch->tctxt.ll_len_nocase; + } else { + ll_buf = scratch->tctxt.ll_buf; + ll_len = scratch->tctxt.ll_len; + } + + assert(ll_buf); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("ll_len=%zu, hist_rewind=%u\n", ll_len, hist_rewind); + if (hist_rewind > ll_len) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ll_len, hist_rewind); + assert(hist_rewind <= ll_len); + if (cmpForward(ll_buf + ll_len - hist_rewind, lit, prefix_len, nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -1080,7 +1428,7 @@ void updateSeqPoint(struct RoseContext *tctxt, u64a offset, static rose_inline hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, + u64a som, u64a end, UNUSED size_t match_len, u8 prog_flags) { DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, som, end, prog_flags); @@ -1113,9 +1461,15 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, assert(pc >= pc_base); assert((size_t)(pc - pc_base) < t->size); const u8 code = *(const u8 *)pc; - assert(code <= ROSE_INSTR_END); + assert(code <= LAST_ROSE_INSTRUCTION); switch ((enum RoseInstructionCode)code) { + PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + 
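The CHECK_MASK_32, CHECK_SHUFTI_* and lookaround instructions dispatched below all lean on the same windowing idea as the helpers above (roseCheckMask32, getBufferDataComplex, getData128/getData256): assemble a fixed-width block of bytes that may straddle the history buffer and the current scan buffer, zero-fill whatever lies outside both, and hand the SIMD validator a mask telling it which bytes are real. A standalone sketch of that assembly step, stripped of SIMD; buildWindow is an illustrative name, not a Hyperscan function:

```cpp
#include <cstdint>
#include <cstring>

// Assemble a window of width (<= 32) bytes starting at signed position loc:
// negative positions reach back into the history buffer hist (length hlen),
// non-negative ones index the current buffer buf (length len). Bytes outside
// both buffers stay zero; bit i of the returned mask is set iff window[i]
// holds real data.
static uint32_t buildWindow(uint8_t *window, int width,
                            const uint8_t *hist, size_t hlen,
                            const uint8_t *buf, size_t len, int64_t loc) {
    std::memset(window, 0, size_t(width));
    uint32_t mask = 0;
    for (int i = 0; i < width; i++) {
        int64_t pos = loc + i;
        if (pos < 0) {
            if (uint64_t(-pos) <= hlen) {
                window[i] = hist[hlen + pos]; // inside history
                mask |= 1u << i;
            }
        } else if (uint64_t(pos) < len) {
            window[i] = buf[pos]; // inside current buffer
            mask |= 1u << i;
        }
    }
    return mask;
}
```

The real helpers perform the same partition with at most two copy_upto_32_bytes calls, deriving the mask from the h_shift/c_shift byte counts rather than testing per byte.
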
        PROGRAM_CASE(ANCHORED_DELAY) {
                if (in_anchored && end > t->floatingMinLiteralMatchOffset) {
                    DEBUG_PRINTF("delay until playback\n");
@@ -1128,17 +1482,6 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
            }
            PROGRAM_NEXT_INSTRUCTION
 
-           PROGRAM_CASE(CHECK_LIT_MASK) {
-               assert(match_len);
-               struct core_info *ci = &scratch->core_info;
-               if (!roseCheckBenefits(ci, end, match_len, ri->and_mask.a8,
-                                      ri->cmp_mask.a8)) {
-                   DEBUG_PRINTF("halt: failed mask check\n");
-                   return HWLM_CONTINUE_MATCHING;
-               }
-           }
-           PROGRAM_NEXT_INSTRUCTION
-
            PROGRAM_CASE(CHECK_LIT_EARLY) {
                if (end < ri->min_offset) {
                    DEBUG_PRINTF("halt: before min_offset=%u\n",
@@ -1190,6 +1533,17 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
            }
            PROGRAM_NEXT_INSTRUCTION
 
+           PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) {
+               if (!roseCheckSingleLookaround(t, scratch, ri->offset,
+                                              ri->reach_index, end)) {
+                   DEBUG_PRINTF("failed lookaround check\n");
+                   assert(ri->fail_jump); // must progress
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
            PROGRAM_CASE(CHECK_LOOKAROUND) {
                if (!roseCheckLookaround(t, scratch, ri->index, ri->count,
                                         end)) {
@@ -1213,6 +1567,17 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
            }
            PROGRAM_NEXT_INSTRUCTION
 
+           PROGRAM_CASE(CHECK_MASK_32) {
+               struct core_info *ci = &scratch->core_info;
+               if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask,
+                                    ri->neg_mask, ri->offset, end)) {
+                   assert(ri->fail_jump);
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
            PROGRAM_CASE(CHECK_BYTE) {
                const struct core_info *ci = &scratch->core_info;
                if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask,
@@ -1225,6 +1590,55 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
            }
            PROGRAM_NEXT_INSTRUCTION
 
+           PROGRAM_CASE(CHECK_SHUFTI_16x8) {
+               const struct core_info *ci = &scratch->core_info;
+               if (!roseCheckShufti16x8(ci, ri->nib_mask,
+                                        ri->bucket_select_mask,
+                                        ri->neg_mask, ri->offset, end)) {
+                   assert(ri->fail_jump);
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
+           PROGRAM_CASE(CHECK_SHUFTI_32x8) {
+               const struct core_info *ci = &scratch->core_info;
+               if (!roseCheckShufti32x8(ci, ri->hi_mask, ri->lo_mask,
+                                        ri->bucket_select_mask,
+                                        ri->neg_mask, ri->offset, end)) {
+                   assert(ri->fail_jump);
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
+           PROGRAM_CASE(CHECK_SHUFTI_16x16) {
+               const struct core_info *ci = &scratch->core_info;
+               if (!roseCheckShufti16x16(ci, ri->hi_mask, ri->lo_mask,
+                                         ri->bucket_select_mask,
+                                         ri->neg_mask, ri->offset, end)) {
+                   assert(ri->fail_jump);
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
+           PROGRAM_CASE(CHECK_SHUFTI_32x16) {
+               const struct core_info *ci = &scratch->core_info;
+               if (!roseCheckShufti32x16(ci, ri->hi_mask, ri->lo_mask,
+                                         ri->bucket_select_mask_hi,
+                                         ri->bucket_select_mask_lo,
+                                         ri->neg_mask, ri->offset, end)) {
+                   assert(ri->fail_jump);
+                   pc += ri->fail_jump;
+                   continue;
+               }
+           }
+           PROGRAM_NEXT_INSTRUCTION
+
            PROGRAM_CASE(CHECK_INFIX) {
                if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report,
                                   end)) {
@@ -1590,6 +2004,28 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
            }
            PROGRAM_NEXT_INSTRUCTION
 
+           PROGRAM_CASE(SPARSE_ITER_ANY) {
+               DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset);
+               const struct mmbit_sparse_iter *it =
+                   getByOffset(t, ri->iter_offset);
+               assert(ISALIGNED(it));
+
+               const u8 *roles = getRoleState(scratch->core_info.state);
+
+               u32 idx = 0;
+               u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount,
+                                               &idx, it, si_state);
+               if (i
== MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + DEBUG_PRINTF("state %u (idx=%u) is on\n", i, idx); + fatbit_clear(scratch->handled_roles); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { if (roseEnginesEod(t, scratch, end, ri->iter_offset) == HWLM_TERMINATE_MATCHING) { @@ -1614,9 +2050,23 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(END) { - DEBUG_PRINTF("finished\n"); - return HWLM_CONTINUE_MATCHING; + PROGRAM_CASE(CHECK_LONG_LIT) { + const char nocase = 0; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("halt: failed long lit check\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("halt: failed nocase long lit check\n"); + return HWLM_CONTINUE_MATCHING; + } } PROGRAM_NEXT_INSTRUCTION } diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 0f0e8d18..8b10bc7d 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -112,11 +112,10 @@ RoseVertex createVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, RoseGraph &g = build->g; // add to tree RoseVertex v = add_vertex(g); - g[v].idx = build->vertexIndex++; g[v].min_offset = min_offset; g[v].max_offset = max_offset; - DEBUG_PRINTF("insert vertex %zu into literal %u's vertex set\n", g[v].idx, + DEBUG_PRINTF("insert vertex %zu into literal %u's vertex set\n", g[v].index, literalId); g[v].literals.insert(literalId); build->literal_info[literalId].vertices.insert(v); @@ -137,10 +136,7 @@ RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, /* fill in report information */ g[v].reports.insert(reports.begin(), reports.end()); - RoseEdge e; - bool added; - tie(e, added) = add_edge(parent, v, g); - assert(added); + RoseEdge e = add_edge(parent, v, g); DEBUG_PRINTF("adding edge (%u, %u) to parent\n", minBound, maxBound); g[e].minBound = minBound; @@ -167,10 +163,10 @@ RoseVertex createAnchoredVertex(RoseBuildImpl *build, u32 literalId, RoseGraph &g = build->g; RoseVertex v = createVertex(build, literalId, min_offset, max_offset); - DEBUG_PRINTF("created anchored vertex %zu with lit id %u\n", g[v].idx, + DEBUG_PRINTF("created anchored vertex %zu with lit id %u\n", g[v].index, literalId); - RoseEdge e = add_edge(build->anchored_root, v, g).first; + RoseEdge e = add_edge(build->anchored_root, v, g); g[e].minBound = min_offset; g[e].maxBound = max_offset; @@ -181,8 +177,7 @@ static RoseVertex duplicate(RoseBuildImpl *build, RoseVertex v) { RoseGraph &g = build->g; RoseVertex w = add_vertex(g[v], g); - g[w].idx = build->vertexIndex++; - DEBUG_PRINTF("added vertex %zu\n", g[w].idx); + DEBUG_PRINTF("added vertex %zu\n", g[w].index); for (auto lit_id : g[w].literals) { build->literal_info[lit_id].vertices.insert(w); @@ -191,7 +186,7 @@ RoseVertex duplicate(RoseBuildImpl *build, RoseVertex v) { for (const auto &e : in_edges_range(v, g)) { RoseVertex s = source(e, g); add_edge(s, w, g[e], g); - DEBUG_PRINTF("added edge (%zu,%zu)\n", g[s].idx, g[w].idx); + DEBUG_PRINTF("added edge (%zu,%zu)\n", g[s].index, g[w].index); } return w; @@ -227,7 +222,7 @@ RoseRoleHistory selectHistory(const RoseBuildImpl &tbi, const RoseBuildData &bd, const bool has_bounds = g[e].minBound || 
(g[e].maxBound != ROSE_BOUND_INF); DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n", - g[u].idx, g[v].idx, g[e].minBound, g[e].maxBound, + g[u].index, g[v].index, g[e].minBound, g[e].maxBound, (int)g[u].fixedOffset(), (int)g[v].left); if (g[v].left) { @@ -309,7 +304,7 @@ void createVertices(RoseBuildImpl *tbi, DEBUG_PRINTF("set som_adjust to %u\n", g[w].som_adjust); } - DEBUG_PRINTF(" adding new vertex idx=%zu\n", tbi->g[w].idx); + DEBUG_PRINTF(" adding new vertex index=%zu\n", tbi->g[w].index); vertex_map[iv].push_back(w); } else { w = created[key]; @@ -317,10 +312,7 @@ void createVertices(RoseBuildImpl *tbi, RoseVertex p = pv.first; - RoseEdge e; - bool added; - tie(e, added) = add_edge(p, w, g); - assert(added); + RoseEdge e = add_edge(p, w, g); DEBUG_PRINTF("adding edge (%u,%u) to parent\n", edge_props.minBound, edge_props.maxBound); g[e].minBound = edge_props.minBound; @@ -358,7 +350,7 @@ void createVertices(RoseBuildImpl *tbi, for (const auto &pv : parents) { const RoseInEdgeProps &edge_props = bd.ig[pv.second]; - RoseEdge e = add_edge(pv.first, g_v, tbi->g).first; + RoseEdge e = add_edge(pv.first, g_v, tbi->g); g[e].minBound = edge_props.minBound; g[e].maxBound = edge_props.maxBound; g[e].history = selectHistory(*tbi, bd, pv.second, e); @@ -383,7 +375,7 @@ void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { next.clear(); for (auto curr_v : curr) { - DEBUG_PRINTF("handling %u\n", g[curr_v].index); + DEBUG_PRINTF("handling %zu\n", g[curr_v].index); vector next_cand; insert(&next_cand, next_cand.end(), inv_adjacent_vertices(curr_v, g)); @@ -401,7 +393,7 @@ void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { const CharReach &cr = g[v].char_reach; if (!overlaps(*it, cr)) { - DEBUG_PRINTF("false edge %u\n", g[v].index); + DEBUG_PRINTF("false edge %zu\n", g[v].index); continue; } @@ -409,7 +401,7 @@ void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { clone_in_edges(g, v, v2); add_edge(v2, curr_v, g); g[v2].char_reach &= *it; - DEBUG_PRINTF("next <- %u\n", g[v2].index); + DEBUG_PRINTF("next <- %zu\n", g[v2].index); next.insert(v2); } } @@ -557,7 +549,7 @@ void findRoseLiteralMask(const NGHolder &h, const u32 lag, vector &msk, next.clear(); CharReach cr; for (auto v : curr) { - DEBUG_PRINTF("vertex %u, reach %s\n", h[v].index, + DEBUG_PRINTF("vertex %zu, reach %s\n", h[v].index, describeClass(h[v].char_reach).c_str()); cr |= h[v].char_reach; insert(&next, inv_adjacent_vertices(v, h)); @@ -705,14 +697,13 @@ void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, for (const auto &report_mapping : report_remap) { RoseVertex v = add_vertex(g); - g[v].idx = build.vertexIndex++; g[v].literals.insert(eod_event); build.literal_info[eod_event].vertices.insert(v); g[v].left.graph = eod_leftfix; g[v].left.leftfix_report = report_mapping.second; g[v].left.lag = 0; - RoseEdge e1 = add_edge(u, v, g).first; + RoseEdge e1 = add_edge(u, v, g); g[e1].minBound = 0; g[e1].maxBound = ROSE_BOUND_INF; g[v].min_offset = add_rose_depth(g[u].min_offset, @@ -728,16 +719,15 @@ void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix RoseVertex w = add_vertex(g); - g[w].idx = build.vertexIndex++; g[w].eod_accept = true; g[w].reports = report_mapping.first; g[w].min_offset = g[v].min_offset; g[w].max_offset = g[v].max_offset; - RoseEdge e = add_edge(v, w, g).first; + RoseEdge e = add_edge(v, w, g); g[e].minBound = 0; g[e].maxBound = 0; 
g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; - DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); } } @@ -769,7 +759,7 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) && !edge_props.graph) || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { - DEBUG_PRINTF("duplicating for parent %zu\n", g[u].idx); + DEBUG_PRINTF("duplicating for parent %zu\n", g[u].index); assert(!tbi->isAnyStart(u)); u = duplicate(tbi, u); g[u].suffix.reset(); @@ -780,20 +770,20 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, if (ig[iv].type == RIV_ACCEPT) { assert(!tbi->isAnyStart(u)); if (contains(bd.early_dfas, edge_props.graph.get())) { - DEBUG_PRINTF("adding early dfa suffix to i%zu\n", g[u].idx); + DEBUG_PRINTF("adding early dfa suffix to i%zu\n", g[u].index); g[u].suffix.rdfa = bd.early_dfas.at(edge_props.graph.get()); g[u].suffix.dfa_min_width = findMinWidth(*edge_props.graph); g[u].suffix.dfa_max_width = findMaxWidth(*edge_props.graph); } else if (edge_props.graph) { - DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); + DEBUG_PRINTF("adding suffix to i%zu\n", g[u].index); g[u].suffix.graph = edge_props.graph; assert(g[u].suffix.graph->kind == NFA_SUFFIX); /* TODO: set dfa_(min|max)_width */ } else if (edge_props.haig) { - DEBUG_PRINTF("adding suffaig to i%zu\n", g[u].idx); + DEBUG_PRINTF("adding suffaig to i%zu\n", g[u].index); g[u].suffix.haig = edge_props.haig; } else { - DEBUG_PRINTF("adding boring accept to i%zu\n", g[u].idx); + DEBUG_PRINTF("adding boring accept to i%zu\n", g[u].index); assert(!g[u].eod_accept); g[u].reports = ig[iv].reports; } @@ -803,16 +793,15 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, if (!edge_props.graph) { RoseVertex w = add_vertex(g); - g[w].idx = tbi->vertexIndex++; g[w].eod_accept = true; g[w].reports = ig[iv].reports; g[w].min_offset = g[u].min_offset; g[w].max_offset = g[u].max_offset; - RoseEdge e = add_edge(u, w, g).first; + RoseEdge e = add_edge(u, w, g); g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; - DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); continue; } @@ -824,7 +813,7 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, assert(h.kind == NFA_SUFFIX); assert(!tbi->isAnyStart(u)); /* etable can't/shouldn't use eod event */ - DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); + DEBUG_PRINTF("adding suffix to i%zu\n", g[u].index); g[u].suffix.graph = edge_props.graph; continue; } @@ -976,7 +965,7 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { || ig[v_order.front()].type == RIV_ANCHORED_START); for (RoseInVertex iv : v_order) { - DEBUG_PRINTF("vertex %p\n", iv); + DEBUG_PRINTF("vertex %zu\n", ig[iv].index); if (ig[iv].type == RIV_START) { DEBUG_PRINTF("is root\n"); @@ -1588,6 +1577,7 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, bool finalChance) { DEBUG_PRINTF("trying to rose\n"); assert(validateKinds(ig)); + assert(hasCorrectlyNumberedVertices(ig)); if (::ue2::empty(ig)) { assert(0); @@ -1603,7 +1593,8 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, transformAnchoredLiteralOverlap(in, bd, cc); transformSuffixDelay(in, cc); - assert(validateKinds(ig)); + renumber_vertices(in); + assert(validateKinds(in)); map > graphs; vector ordered_graphs; // Stored in first-encounter order. 
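A side note on the `ordered_graphs` vector kept alongside `graphs` above: the map is keyed on NGHolder pointers, so iterating it directly would visit graphs in allocation-address order, which can differ from run to run. Recording each graph at first encounter keeps the later passes deterministic. A minimal sketch of the pattern; `Graph` and the `int` edge payload are stand-ins, not Hyperscan types:

```cpp
#include <map>
#include <vector>

struct Graph; // stand-in for NGHolder

// Remember each graph once, in first-encounter order, so later passes can
// walk the graphs in a stable order; the pointer-keyed map alone would
// iterate in address order.
static void recordGraph(std::map<Graph *, std::vector<int>> &graphs,
                        std::vector<Graph *> &ordered_graphs,
                        Graph *g, int edge) {
    if (graphs.find(g) == graphs.end()) {
        ordered_graphs.push_back(g); // first time we have seen this graph
    }
    graphs[g].push_back(edge);
}
```
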
@@ -1619,6 +1610,8 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, } NGHolder *h = in[e].graph.get(); + + assert(isCorrectlyTopped(*h)); if (!contains(graphs, h)) { ordered_graphs.push_back(h); } @@ -1760,8 +1753,7 @@ static u32 findMaxBAWidth(const NGHolder &h) { // Must be bi-anchored: no out-edges from startDs (other than its // self-loop), no in-edges to accept. - if (hasGreaterOutDegree(1, h.startDs, h) || - hasGreaterInDegree(0, h.accept, h)) { + if (out_degree(h.startDs, h) > 1 || in_degree(h.accept, h)) { return ROSE_BOUND_INF; } depth d = findMaxWidth(h); @@ -1887,9 +1879,9 @@ bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, map &allocated_reports, flat_set &added_lit_ids) { const depth max_anchored_depth(tbi.cc.grey.maxAnchoredRegion); - const u32 idx = w[u].index; - assert(idx < vertexDepths.size()); - const DepthMinMax &d = vertexDepths.at(idx); + const size_t index = w[u].index; + assert(index < vertexDepths.size()); + const DepthMinMax &d = vertexDepths.at(index); for (const auto &int_report : w[u].reports) { assert(int_report != MO_INVALID_IDX); @@ -2006,7 +1998,6 @@ bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { RoseVertex v = createAnchoredVertex(this, lit_id, minBound, maxBound); RoseVertex eod = add_vertex(g); - g[eod].idx = vertexIndex++; g[eod].eod_accept = true; g[eod].reports.insert(report); g[eod].min_offset = g[v].min_offset; diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index 45333a38..de3bdf0a 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -532,7 +532,7 @@ void addTransientMask(RoseBuildImpl &build, const vector &mask, g[v].left.leftfix_report = mask_report; } else { // Make sure our edge bounds are correct. - auto e = edge_by_target(parent, v, g).first; + RoseEdge e = edge(parent, v, g); g[e].minBound = 0; g[e].maxBound = anchored ? 0 : ROSE_BOUND_INF; g[e].history = anchored ? 
ROSE_ROLE_HISTORY_ANCH @@ -544,7 +544,7 @@ void addTransientMask(RoseBuildImpl &build, const vector &mask, g[v].max_offset = v_max_offset; if (eod) { - auto e = add_edge(v, eod_v, g).first; + RoseEdge e = add_edge(v, eod_v, g); g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; @@ -574,7 +574,8 @@ unique_ptr buildMaskRhs(const ue2::flat_set &reports, succ = u; } - add_edge(h.start, succ, h); + NFAEdge e = add_edge(h.start, succ, h); + h[e].tops.insert(DEFAULT_TOP); return rhs; } @@ -632,6 +633,7 @@ void doAddMask(RoseBuildImpl &tbi, bool anchored, = buildMaskLhs(true, minBound - prefix2_len + overlap, mask3); mhs->kind = NFA_INFIX; + setTops(*mhs); add_edge(u, v, RoseInEdgeProps(mhs, delay), ig); DEBUG_PRINTF("add anch literal too!\n"); diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 60732ff9..3d0affc6 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -549,7 +549,7 @@ bool isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound, /* lit should only be connected to dot vertices */ for (auto u : inv_adjacent_vertices_range(lit_head, h)) { - DEBUG_PRINTF("checking %u\n", h[u].index); + DEBUG_PRINTF("checking %zu\n", h[u].index); if (!h[u].char_reach.all()) { return false; } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 56591de8..9f4abcad 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -33,20 +33,26 @@ #include "hs_compile.h" // for HS_MODE_* #include "rose_build_add_internal.h" #include "rose_build_anchored.h" +#include "rose_build_engine_blob.h" #include "rose_build_exclusive.h" #include "rose_build_groups.h" #include "rose_build_infix.h" +#include "rose_build_long_lit.h" #include "rose_build_lookaround.h" #include "rose_build_matchers.h" +#include "rose_build_program.h" #include "rose_build_scatter.h" #include "rose_build_util.h" #include "rose_build_width.h" +#include "rose_internal.h" #include "rose_program.h" #include "hwlm/hwlm.h" /* engine types */ +#include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" #include "nfa/goughcompile.h" #include "nfa/mcclellancompile.h" #include "nfa/mcclellancompile_util.h" +#include "nfa/mcsheng_compile.h" #include "nfa/nfa_api_queue.h" #include "nfa/nfa_build_util.h" #include "nfa/nfa_internal.h" @@ -75,6 +81,7 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/container.h" +#include "util/fatbit_build.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/multibit_build.h" @@ -86,6 +93,7 @@ #include "util/verify_types.h" #include +#include #include #include #include @@ -147,218 +155,6 @@ struct left_build_info { vector lookaround; // alternative implementation to the NFA }; -/** - * \brief Possible jump targets for roles that perform checks. - * - * Fixed up into offsets before the program is written to bytecode. - */ -enum class JumpTarget { - NO_JUMP, //!< Instruction does not jump. - PROGRAM_END, //!< Jump to end of program. - NEXT_BLOCK, //!< Jump to start of next block (sparse iter check, etc). - FIXUP_DONE, //!< Target fixup already applied. -}; - -/** \brief Role instruction model used at compile time. 
*/ -class RoseInstruction { -public: - RoseInstruction(enum RoseInstructionCode c, JumpTarget j) : target(j) { - memset(&u, 0, sizeof(u)); - u.end.code = c; - } - - explicit RoseInstruction(enum RoseInstructionCode c) - : RoseInstruction(c, JumpTarget::NO_JUMP) {} - - bool operator<(const RoseInstruction &a) const { - if (code() != a.code()) { - return code() < a.code(); - } - if (target != a.target) { - return target < a.target; - } - return memcmp(&u, &a.u, sizeof(u)) < 0; - } - - bool operator==(const RoseInstruction &a) const { - return code() == a.code() && target == a.target && - memcmp(&u, &a.u, sizeof(u)) == 0; - } - - enum RoseInstructionCode code() const { - // Note that this sort of type-punning (relying on identical initial - // layout) is explicitly allowed by the C++11 standard. - return (enum RoseInstructionCode)u.end.code; - } - - const void *get() const { - switch (code()) { - case ROSE_INSTR_CHECK_LIT_MASK: return &u.checkLitMask; - case ROSE_INSTR_CHECK_LIT_EARLY: return &u.checkLitEarly; - case ROSE_INSTR_CHECK_GROUPS: return &u.checkGroups; - case ROSE_INSTR_CHECK_ONLY_EOD: return &u.checkOnlyEod; - case ROSE_INSTR_CHECK_BOUNDS: return &u.checkBounds; - case ROSE_INSTR_CHECK_NOT_HANDLED: return &u.checkNotHandled; - case ROSE_INSTR_CHECK_LOOKAROUND: return &u.checkLookaround; - case ROSE_INSTR_CHECK_MASK: return &u.checkMask; - case ROSE_INSTR_CHECK_BYTE: return &u.checkByte; - case ROSE_INSTR_CHECK_INFIX: return &u.checkInfix; - case ROSE_INSTR_CHECK_PREFIX: return &u.checkPrefix; - case ROSE_INSTR_ANCHORED_DELAY: return &u.anchoredDelay; - case ROSE_INSTR_PUSH_DELAYED: return &u.pushDelayed; - case ROSE_INSTR_RECORD_ANCHORED: return &u.recordAnchored; - case ROSE_INSTR_CATCH_UP: return &u.catchUp; - case ROSE_INSTR_CATCH_UP_MPV: return &u.catchUpMpv; - case ROSE_INSTR_SOM_ADJUST: return &u.somAdjust; - case ROSE_INSTR_SOM_LEFTFIX: return &u.somLeftfix; - case ROSE_INSTR_SOM_FROM_REPORT: return &u.somFromReport; - case ROSE_INSTR_SOM_ZERO: return &u.somZero; - case ROSE_INSTR_TRIGGER_INFIX: return &u.triggerInfix; - case ROSE_INSTR_TRIGGER_SUFFIX: return &u.triggerSuffix; - case ROSE_INSTR_DEDUPE: return &u.dedupe; - case ROSE_INSTR_DEDUPE_SOM: return &u.dedupeSom; - case ROSE_INSTR_REPORT_CHAIN: return &u.reportChain; - case ROSE_INSTR_REPORT_SOM_INT: return &u.reportSomInt; - case ROSE_INSTR_REPORT_SOM_AWARE: return &u.reportSomAware; - case ROSE_INSTR_REPORT: return &u.report; - case ROSE_INSTR_REPORT_EXHAUST: return &u.reportExhaust; - case ROSE_INSTR_REPORT_SOM: return &u.reportSom; - case ROSE_INSTR_REPORT_SOM_EXHAUST: return &u.reportSomExhaust; - case ROSE_INSTR_DEDUPE_AND_REPORT: return &u.dedupeAndReport; - case ROSE_INSTR_FINAL_REPORT: return &u.finalReport; - case ROSE_INSTR_CHECK_EXHAUSTED: return &u.checkExhausted; - case ROSE_INSTR_CHECK_MIN_LENGTH: return &u.checkMinLength; - case ROSE_INSTR_SET_STATE: return &u.setState; - case ROSE_INSTR_SET_GROUPS: return &u.setGroups; - case ROSE_INSTR_SQUASH_GROUPS: return &u.squashGroups; - case ROSE_INSTR_CHECK_STATE: return &u.checkState; - case ROSE_INSTR_SPARSE_ITER_BEGIN: return &u.sparseIterBegin; - case ROSE_INSTR_SPARSE_ITER_NEXT: return &u.sparseIterNext; - case ROSE_INSTR_ENGINES_EOD: return &u.enginesEod; - case ROSE_INSTR_SUFFIXES_EOD: return &u.suffixesEod; - case ROSE_INSTR_MATCHER_EOD: return &u.matcherEod; - case ROSE_INSTR_END: return &u.end; - } - assert(0); - return &u.end; - } - - size_t length() const { - switch (code()) { - case ROSE_INSTR_CHECK_LIT_MASK: return sizeof(u.checkLitMask); - 
case ROSE_INSTR_CHECK_LIT_EARLY: return sizeof(u.checkLitEarly); - case ROSE_INSTR_CHECK_GROUPS: return sizeof(u.checkGroups); - case ROSE_INSTR_CHECK_ONLY_EOD: return sizeof(u.checkOnlyEod); - case ROSE_INSTR_CHECK_BOUNDS: return sizeof(u.checkBounds); - case ROSE_INSTR_CHECK_NOT_HANDLED: return sizeof(u.checkNotHandled); - case ROSE_INSTR_CHECK_LOOKAROUND: return sizeof(u.checkLookaround); - case ROSE_INSTR_CHECK_MASK: return sizeof(u.checkMask); - case ROSE_INSTR_CHECK_BYTE: return sizeof(u.checkByte); - case ROSE_INSTR_CHECK_INFIX: return sizeof(u.checkInfix); - case ROSE_INSTR_CHECK_PREFIX: return sizeof(u.checkPrefix); - case ROSE_INSTR_ANCHORED_DELAY: return sizeof(u.anchoredDelay); - case ROSE_INSTR_PUSH_DELAYED: return sizeof(u.pushDelayed); - case ROSE_INSTR_RECORD_ANCHORED: return sizeof(u.recordAnchored); - case ROSE_INSTR_CATCH_UP: return sizeof(u.catchUp); - case ROSE_INSTR_CATCH_UP_MPV: return sizeof(u.catchUpMpv); - case ROSE_INSTR_SOM_ADJUST: return sizeof(u.somAdjust); - case ROSE_INSTR_SOM_LEFTFIX: return sizeof(u.somLeftfix); - case ROSE_INSTR_SOM_FROM_REPORT: return sizeof(u.somFromReport); - case ROSE_INSTR_SOM_ZERO: return sizeof(u.somZero); - case ROSE_INSTR_TRIGGER_INFIX: return sizeof(u.triggerInfix); - case ROSE_INSTR_TRIGGER_SUFFIX: return sizeof(u.triggerSuffix); - case ROSE_INSTR_DEDUPE: return sizeof(u.dedupe); - case ROSE_INSTR_DEDUPE_SOM: return sizeof(u.dedupeSom); - case ROSE_INSTR_REPORT_CHAIN: return sizeof(u.reportChain); - case ROSE_INSTR_REPORT_SOM_INT: return sizeof(u.reportSomInt); - case ROSE_INSTR_REPORT_SOM_AWARE: return sizeof(u.reportSomAware); - case ROSE_INSTR_REPORT: return sizeof(u.report); - case ROSE_INSTR_REPORT_EXHAUST: return sizeof(u.reportExhaust); - case ROSE_INSTR_REPORT_SOM: return sizeof(u.reportSom); - case ROSE_INSTR_REPORT_SOM_EXHAUST: return sizeof(u.reportSomExhaust); - case ROSE_INSTR_DEDUPE_AND_REPORT: return sizeof(u.dedupeAndReport); - case ROSE_INSTR_FINAL_REPORT: return sizeof(u.finalReport); - case ROSE_INSTR_CHECK_EXHAUSTED: return sizeof(u.checkExhausted); - case ROSE_INSTR_CHECK_MIN_LENGTH: return sizeof(u.checkMinLength); - case ROSE_INSTR_SET_STATE: return sizeof(u.setState); - case ROSE_INSTR_SET_GROUPS: return sizeof(u.setGroups); - case ROSE_INSTR_SQUASH_GROUPS: return sizeof(u.squashGroups); - case ROSE_INSTR_CHECK_STATE: return sizeof(u.checkState); - case ROSE_INSTR_SPARSE_ITER_BEGIN: return sizeof(u.sparseIterBegin); - case ROSE_INSTR_SPARSE_ITER_NEXT: return sizeof(u.sparseIterNext); - case ROSE_INSTR_ENGINES_EOD: return sizeof(u.enginesEod); - case ROSE_INSTR_SUFFIXES_EOD: return sizeof(u.suffixesEod); - case ROSE_INSTR_MATCHER_EOD: return sizeof(u.matcherEod); - case ROSE_INSTR_END: return sizeof(u.end); - } - assert(0); - return 0; - } - - union { - ROSE_STRUCT_CHECK_LIT_MASK checkLitMask; - ROSE_STRUCT_CHECK_LIT_EARLY checkLitEarly; - ROSE_STRUCT_CHECK_GROUPS checkGroups; - ROSE_STRUCT_CHECK_ONLY_EOD checkOnlyEod; - ROSE_STRUCT_CHECK_BOUNDS checkBounds; - ROSE_STRUCT_CHECK_NOT_HANDLED checkNotHandled; - ROSE_STRUCT_CHECK_LOOKAROUND checkLookaround; - ROSE_STRUCT_CHECK_MASK checkMask; - ROSE_STRUCT_CHECK_BYTE checkByte; - ROSE_STRUCT_CHECK_INFIX checkInfix; - ROSE_STRUCT_CHECK_PREFIX checkPrefix; - ROSE_STRUCT_ANCHORED_DELAY anchoredDelay; - ROSE_STRUCT_PUSH_DELAYED pushDelayed; - ROSE_STRUCT_RECORD_ANCHORED recordAnchored; - ROSE_STRUCT_CATCH_UP catchUp; - ROSE_STRUCT_CATCH_UP_MPV catchUpMpv; - ROSE_STRUCT_SOM_ADJUST somAdjust; - ROSE_STRUCT_SOM_LEFTFIX somLeftfix; - ROSE_STRUCT_SOM_FROM_REPORT 
somFromReport; - ROSE_STRUCT_SOM_ZERO somZero; - ROSE_STRUCT_TRIGGER_INFIX triggerInfix; - ROSE_STRUCT_TRIGGER_SUFFIX triggerSuffix; - ROSE_STRUCT_DEDUPE dedupe; - ROSE_STRUCT_DEDUPE_SOM dedupeSom; - ROSE_STRUCT_REPORT_CHAIN reportChain; - ROSE_STRUCT_REPORT_SOM_INT reportSomInt; - ROSE_STRUCT_REPORT_SOM_AWARE reportSomAware; - ROSE_STRUCT_REPORT report; - ROSE_STRUCT_REPORT_EXHAUST reportExhaust; - ROSE_STRUCT_REPORT_SOM reportSom; - ROSE_STRUCT_REPORT_SOM_EXHAUST reportSomExhaust; - ROSE_STRUCT_DEDUPE_AND_REPORT dedupeAndReport; - ROSE_STRUCT_FINAL_REPORT finalReport; - ROSE_STRUCT_CHECK_EXHAUSTED checkExhausted; - ROSE_STRUCT_CHECK_MIN_LENGTH checkMinLength; - ROSE_STRUCT_SET_STATE setState; - ROSE_STRUCT_SET_GROUPS setGroups; - ROSE_STRUCT_SQUASH_GROUPS squashGroups; - ROSE_STRUCT_CHECK_STATE checkState; - ROSE_STRUCT_SPARSE_ITER_BEGIN sparseIterBegin; - ROSE_STRUCT_SPARSE_ITER_NEXT sparseIterNext; - ROSE_STRUCT_ENGINES_EOD enginesEod; - ROSE_STRUCT_SUFFIXES_EOD suffixesEod; - ROSE_STRUCT_MATCHER_EOD matcherEod; - ROSE_STRUCT_END end; - } u; - - JumpTarget target; -}; - -static -size_t hash_value(const RoseInstruction &ri) { - size_t val = 0; - boost::hash_combine(val, ri.code()); - boost::hash_combine(val, ri.target); - const char *bytes = (const char *)ri.get(); - const size_t len = ri.length(); - for (size_t i = 0; i < len; i++) { - boost::hash_combine(val, bytes[i]); - } - return val; -} - /** * \brief Structure tracking which resources are used by this Rose instance at * runtime. @@ -374,7 +170,7 @@ struct RoseResources { bool has_states = false; bool checks_groups = false; bool has_lit_delay = false; - bool has_lit_mask = false; + bool has_lit_check = false; // long literal support bool has_anchored = false; bool has_eod = false; }; @@ -397,13 +193,10 @@ struct build_context : boost::noncopyable { */ size_t numStates = 0; - /** \brief Very simple cache from sparse iter to offset, used when building - * up iterators in early misc. */ - map, u32> iterCache; - /** \brief Simple cache of programs written to engine blob, used for * deduplication. */ - ue2::unordered_map, u32> program_cache; + ue2::unordered_map program_cache; /** \brief LookEntry list cache, so that we don't have to go scanning * through the full list to find cases we've used already. */ @@ -423,12 +216,19 @@ struct build_context : boost::noncopyable { * written to the engine_blob. */ vector litPrograms; + /** \brief List of long literals (ones with CHECK_LITERAL instructions) + * that need hash table support. */ + vector longLiterals; + /** \brief Minimum offset of a match from the floating table. */ u32 floatingMinLiteralMatchOffset = 0; + /** \brief Long literal length threshold, used in streaming mode. */ + size_t longLitLengthThreshold = 0; + /** \brief Contents of the Rose bytecode immediately following the * RoseEngine. */ - vector> engine_blob; + RoseEngineBlob engine_blob; /** \brief True if reports need CATCH_UP instructions, to catch up anchored * matches, suffixes, outfixes etc. */ @@ -446,81 +246,17 @@ struct build_context : boost::noncopyable { /** \brief Global bitmap of groups that can be squashed. */ rose_group squashable_groups = 0; - - /** \brief Base offset of engine_blob in the Rose engine bytecode. 
*/ - static constexpr u32 engine_blob_base = ROUNDUP_CL(sizeof(RoseEngine)); }; } -static -void pad_engine_blob(build_context &bc, size_t align) { - assert(ISALIGNED_N(bc.engine_blob_base, align)); - size_t s = bc.engine_blob.size(); - - if (ISALIGNED_N(s, align)) { - return; - } - - bc.engine_blob.resize(s + align - s % align); -} - -static -u32 add_to_engine_blob(build_context &bc, const void *a, const size_t len, - const size_t align) { - pad_engine_blob(bc, align); - - size_t rv = bc.engine_blob_base + bc.engine_blob.size(); - assert(rv >= bc.engine_blob_base); - DEBUG_PRINTF("write %zu bytes at offset %zu\n", len, rv); - - assert(ISALIGNED_N(bc.engine_blob.size(), align)); - - bc.engine_blob.resize(bc.engine_blob.size() + len); - memcpy(&bc.engine_blob.back() - len + 1, a, len); - - return verify_u32(rv); -} - -template -static -u32 add_to_engine_blob(build_context &bc, const T &a) { - static_assert(is_pod::value, "should be pod"); - return add_to_engine_blob(bc, &a, sizeof(a), alignof(T)); -} - -template -static -u32 add_to_engine_blob(build_context &bc, const T &a, const size_t len) { - static_assert(is_pod::value, "should be pod"); - return add_to_engine_blob(bc, &a, len, alignof(T)); -} - -template -static -u32 add_to_engine_blob(build_context &bc, Iter b, const Iter &e) { - using value_type = typename std::iterator_traits::value_type; - static_assert(is_pod::value, "should be pod"); - - if (b == e) { - return 0; - } - - u32 offset = add_to_engine_blob(bc, *b); - for (++b; b != e; ++b) { - add_to_engine_blob(bc, *b); - } - - return offset; -} - static const NFA *get_nfa_from_blob(const build_context &bc, u32 qi) { assert(contains(bc.engineOffsets, qi)); u32 nfa_offset = bc.engineOffsets.at(qi); - assert(nfa_offset >= bc.engine_blob_base); + assert(nfa_offset >= bc.engine_blob.base_offset); const NFA *n = (const NFA *)(bc.engine_blob.data() + nfa_offset - - bc.engine_blob_base); + bc.engine_blob.base_offset); assert(n->queueIndex == qi); return n; } @@ -528,7 +264,7 @@ const NFA *get_nfa_from_blob(const build_context &bc, u32 qi) { static const NFA *add_nfa_to_blob(build_context &bc, NFA &nfa) { u32 qi = nfa.queueIndex; - u32 nfa_offset = add_to_engine_blob(bc, nfa, nfa.length); + u32 nfa_offset = bc.engine_blob.add(nfa, nfa.length); DEBUG_PRINTF("added nfa qi=%u, type=%u, length=%u at offset=%u\n", qi, nfa.type, nfa.length, nfa_offset); @@ -555,35 +291,32 @@ u32 countRosePrefixes(const vector &roses) { * \brief True if this Rose engine needs to run a catch up whenever a report is * generated. * - * This is only the case if there are no anchored literals, suffixes, outfixes - * etc. + * Catch up is necessary if there are output-exposed engines (suffixes, + * outfixes) or an anchored table (anchored literals, acyclic DFAs). 
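 *
 * (Editorial note: in both cases matches can be raised behind the point the
 * literal matchers have already scanned to, so the interpreter must run
 * catch-up before delivering such reports to keep matches ordered.)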
*/ static -bool needsCatchup(const RoseBuildImpl &build) { +bool needsCatchup(const RoseBuildImpl &build, + const vector &anchored_dfas) { if (!build.outfixes.empty()) { DEBUG_PRINTF("has outfixes\n"); return true; } + if (!anchored_dfas.empty()) { + DEBUG_PRINTF("has anchored dfas\n"); + return true; + } const RoseGraph &g = build.g; - if (!isLeafNode(build.anchored_root, g)) { - DEBUG_PRINTF("has anchored vertices\n"); - return true; - } - for (auto v : vertices_range(g)) { if (build.root == v) { continue; } - if (build.anchored_root == v) { - assert(isLeafNode(v, g)); continue; } - if (g[v].suffix) { - DEBUG_PRINTF("vertex %zu has suffix\n", g[v].idx); + DEBUG_PRINTF("vertex %zu has suffix\n", g[v].index); return true; } @@ -594,7 +327,7 @@ bool needsCatchup(const RoseBuildImpl &build) { } static -bool isPureFloating(const RoseResources &resources) { +bool isPureFloating(const RoseResources &resources, const CompileContext &cc) { if (resources.has_outfixes || resources.has_suffixes || resources.has_leftfixes) { DEBUG_PRINTF("has engines\n"); @@ -621,6 +354,12 @@ bool isPureFloating(const RoseResources &resources) { return false; } + if (cc.streaming && resources.has_lit_check) { + DEBUG_PRINTF("has long literals in streaming mode, which needs " + "long literal table support\n"); + return false; + } + if (resources.checks_groups) { DEBUG_PRINTF("has group checks\n"); return false; @@ -664,11 +403,11 @@ u8 pickRuntimeImpl(const RoseBuildImpl &build, const build_context &bc, DEBUG_PRINTF("has_states=%d\n", bc.resources.has_states); DEBUG_PRINTF("checks_groups=%d\n", bc.resources.checks_groups); DEBUG_PRINTF("has_lit_delay=%d\n", bc.resources.has_lit_delay); - DEBUG_PRINTF("has_lit_mask=%d\n", bc.resources.has_lit_mask); + DEBUG_PRINTF("has_lit_check=%d\n", bc.resources.has_lit_check); DEBUG_PRINTF("has_anchored=%d\n", bc.resources.has_anchored); DEBUG_PRINTF("has_eod=%d\n", bc.resources.has_eod); - if (isPureFloating(bc.resources)) { + if (isPureFloating(bc.resources, build.cc)) { return ROSE_RUNTIME_PURE_LITERAL; } @@ -708,7 +447,7 @@ static void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, u32 anchorStateSize, u32 activeArrayCount, u32 activeLeftCount, u32 laggedRoseCount, - u32 floatingStreamStateRequired, u32 historyRequired, + u32 longLitStreamStateRequired, u32 historyRequired, RoseStateOffsets *so) { u32 curr_offset = 0; @@ -726,8 +465,8 @@ void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, so->activeLeftArray_size = mmbit_size(activeLeftCount); curr_offset += so->activeLeftArray_size; - so->floatingMatcherState = curr_offset; - curr_offset += floatingStreamStateRequired; + so->longLitState = curr_offset; + curr_offset += longLitStreamStateRequired; // ONE WHOLE BYTE for each active leftfix with lag. 
so->leftfixLagTable = curr_offset; @@ -793,7 +532,7 @@ bool nfaStuckOn(const NGHolder &g) { set done_tops; for (const auto &e : out_edges_range(g.start, g)) { - tops.insert(g[e].top); + insert(&tops, g[e].tops); if (!g[target(e, g)].char_reach.all()) { continue; } @@ -802,7 +541,7 @@ bool nfaStuckOn(const NGHolder &g) { insert(&asucc, adjacent_vertices(target(e, g), g)); if (asucc == succ) { - done_tops.insert(g[e].top); + insert(&done_tops, g[e].tops); } } @@ -878,7 +617,7 @@ aligned_unique_ptr pickImpl(aligned_unique_ptr dfa_impl, bool d_accel = has_accel(*dfa_impl); bool n_accel = has_accel(*nfa_impl); - bool d_big = dfa_impl->type == MCCLELLAN_NFA_16; + bool d_big = isBigDfaType(dfa_impl->type); bool n_vsmall = nfa_impl->nPositions <= 32; bool n_br = has_bounded_repeats(*nfa_impl); DEBUG_PRINTF("da %d na %d db %d nvs %d nbr %d\n", (int)d_accel, @@ -929,10 +668,17 @@ buildRepeatEngine(const CastleProto &proto, } static -aligned_unique_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, +aligned_unique_ptr getDfa(raw_dfa &rdfa, bool is_transient, + const CompileContext &cc, const ReportManager &rm) { // Unleash the Sheng!! auto dfa = shengCompile(rdfa, cc, rm); + if (!dfa && !is_transient) { + // Sheng wasn't successful, so unleash McClellan! + /* We don't try the hybrid for transient prefixes due to the extra + * bytecode and that they are usually run on small blocks */ + dfa = mcshengCompile(rdfa, cc, rm); + } if (!dfa) { // Sheng wasn't successful, so unleash McClellan! dfa = mcclellanCompile(rdfa, cc, rm); @@ -960,7 +706,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, } if (suff.dfa()) { - auto d = getDfa(*suff.dfa(), cc, rm); + auto d = getDfa(*suff.dfa(), false, cc, rm); assert(d); return d; } @@ -989,7 +735,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0), cc.grey); if (rdfa) { - auto d = getDfa(*rdfa, cc, rm); + auto d = getDfa(*rdfa, false, cc, rm); assert(d); if (cc.grey.roseMcClellanSuffix != 2) { n = pickImpl(move(d), move(n)); @@ -1109,12 +855,12 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, } if (left.dfa()) { - n = getDfa(*left.dfa(), cc, rm); + n = getDfa(*left.dfa(), is_transient, cc, rm); } else if (left.graph() && cc.grey.roseMcClellanPrefix == 2 && is_prefix && !is_transient) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - n = getDfa(*rdfa, cc, rm); + n = getDfa(*rdfa, is_transient, cc, rm); assert(n); } } @@ -1123,8 +869,8 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, if (!n && !is_prefix && left.graph() && onlyOneTop(*left.graph())) { map > > triggers; findTriggerSequences(tbi, infixTriggers.at(left), &triggers); - assert(contains(triggers, 0)); // single top - n = constructLBR(*left.graph(), triggers[0], cc, rm); + assert(triggers.size() == 1); // single top + n = constructLBR(*left.graph(), triggers.begin()->second, cc, rm); } if (!n && left.graph()) { @@ -1141,7 +887,7 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - auto d = getDfa(*rdfa, cc, rm); + auto d = getDfa(*rdfa, is_transient, cc, rm); assert(d); n = pickImpl(move(d), move(n)); } @@ -1210,7 +956,7 @@ void appendTailToHolder(NGHolder &h, const vector &tail) { appendTailToHolder(h, e.first, e.second, tail); } - h.renumberEdges(); + renumber_edges(h); } static @@ -1495,11 +1241,11 @@ void updateTops(const 
RoseGraph &g, const TamaInfo &tamaInfo, for (const auto &n : tamaInfo.subengines) { for (const auto &v : subengines[i].vertices) { if (is_suffix) { - tamaProto.add(n, g[v].idx, g[v].suffix.top, + tamaProto.add(n, g[v].index, g[v].suffix.top, out_top_remap); } else { for (const auto &e : in_edges_range(v, g)) { - tamaProto.add(n, g[v].idx, g[e].rose_top, + tamaProto.add(n, g[v].index, g[e].rose_top, out_top_remap); } } @@ -1543,7 +1289,7 @@ void buildInfixContainer(RoseGraph &g, build_context &bc, for (const auto &sub : subengines) { const auto &verts = sub.vertices; for (const auto &v : verts) { - DEBUG_PRINTF("vert id:%lu\n", g[v].idx); + DEBUG_PRINTF("vert id:%zu\n", g[v].index); g[v].left.tamarama = tamaProto; } } @@ -1562,7 +1308,7 @@ void buildSuffixContainer(RoseGraph &g, build_context &bc, for (const auto &sub : subengines) { const auto &verts = sub.vertices; for (const auto &v : verts) { - DEBUG_PRINTF("vert id:%lu\n", g[v].idx); + DEBUG_PRINTF("vert id:%zu\n", g[v].index); g[v].suffix.tamarama = tamaProto; } const auto &v = verts[0]; @@ -1716,7 +1462,7 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, // Sanity check: our NFA should contain each of the tops mentioned on // our in-edges. - assert(roseHasTops(g, v)); + assert(roseHasTops(build, v)); if (contains(leftfixes, leftfix)) { // NFA already built. @@ -1743,7 +1489,7 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, } if (leftfixes.size() > 1) { - DEBUG_PRINTF("leftfix size:%lu\n", leftfixes.size()); + DEBUG_PRINTF("leftfix size:%zu\n", leftfixes.size()); vector> groups; exclusiveAnalysisInfix(build, vertex_map, roleInfoSet, groups); buildExclusiveInfixes(build, bc, qif, infixTriggers, vertex_map, @@ -1785,7 +1531,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, // Sanity check: our NFA should contain each of the tops mentioned on // our in-edges. - assert(roseHasTops(g, v)); + assert(roseHasTops(tbi, v)); bool is_transient = contains(tbi.transient, leftfix); @@ -1877,7 +1623,7 @@ public: aligned_unique_ptr operator()(unique_ptr &rdfa) const { // Unleash the mighty DFA! - return getDfa(*rdfa, build.cc, build.rm); + return getDfa(*rdfa, false, build.cc, build.rm); } aligned_unique_ptr operator()(unique_ptr &haig) const { @@ -1905,7 +1651,7 @@ public: !has_bounded_repeats_other_than_firsts(*n)) { auto rdfa = buildMcClellan(h, &rm, cc.grey); if (rdfa) { - auto d = getDfa(*rdfa, cc, rm); + auto d = getDfa(*rdfa, false, cc, rm); if (d) { n = pickImpl(move(d), move(n)); } @@ -2053,7 +1799,7 @@ void assignSuffixQueues(RoseBuildImpl &build, build_context &bc) { const suffix_id s(g[v].suffix); - DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].idx, s.graph()); + DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].index, s.graph()); // We may have already built this NFA. if (contains(bc.suffixes, s)) { @@ -2150,7 +1896,7 @@ void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, const suffix_id s(g[v].suffix); - DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].idx, s.graph()); + DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].index, s.graph()); // We may have already built this NFA. 
if (contains(suffixes, s)) { @@ -2180,7 +1926,7 @@ void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, } if (suffixes.size() > 1) { - DEBUG_PRINTF("suffix size:%lu\n", suffixes.size()); + DEBUG_PRINTF("suffix size:%zu\n", suffixes.size()); vector> groups; exclusiveAnalysisSuffix(tbi, vertex_map, roleInfoSet, groups); buildExclusiveSuffixes(tbi, bc, qif, suffixTriggers, vertex_map, @@ -2240,24 +1986,13 @@ bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, } static -void buildCountingMiracles(RoseBuildImpl &build, build_context &bc) { +void buildCountingMiracles(build_context &bc) { map, u32> pre_built; - // To ensure compile determinism, we need to iterate over our leftfixes in - // a stronger order than directly over bc.leftfix_info. - vector cm_vertices; - for (const auto &m : bc.leftfix_info) { - if (m.second.countingMiracleCount) { - cm_vertices.push_back(m.first); + for (left_build_info &lbi : bc.leftfix_info | map_values) { + if (!lbi.countingMiracleCount) { + continue; } - } - sort(begin(cm_vertices), end(cm_vertices), VertexIndexComp(build.g)); - - DEBUG_PRINTF("%zu vertices with counting miracles\n", cm_vertices.size()); - - for (const auto &v : cm_vertices) { - auto &lbi = bc.leftfix_info.at(v); - assert(lbi.countingMiracleCount); const CharReach &cr = lbi.countingMiracleReach; assert(!cr.all() && !cr.none()); @@ -2275,7 +2010,7 @@ void buildCountingMiracles(RoseBuildImpl &build, build_context &bc) { rcm.c = cr.find_first(); } else { rcm.shufti = 1; - int rv = shuftiBuildMasks(cr, &rcm.lo, &rcm.hi); + int rv = shuftiBuildMasks(cr, (u8 *)&rcm.lo, (u8 *)&rcm.hi); if (rv == -1) { DEBUG_PRINTF("failed to build shufti\n"); lbi.countingMiracleCount = 0; /* remove counting miracle */ @@ -2287,7 +2022,7 @@ void buildCountingMiracles(RoseBuildImpl &build, build_context &bc) { rcm.count = lbi.countingMiracleCount; - lbi.countingMiracleOffset = add_to_engine_blob(bc, rcm); + lbi.countingMiracleOffset = bc.engine_blob.add(rcm); pre_built[key] = lbi.countingMiracleOffset; DEBUG_PRINTF("built cm for count of %u @ %u\n", rcm.count, lbi.countingMiracleOffset); @@ -2456,24 +2191,6 @@ u32 RoseBuildImpl::calcHistoryRequired() const { return m ? m - 1 : 0; } -// Adds a sparse iterator to the end of the iterator table, returning its -// offset. 
-static -u32 addIteratorToTable(build_context &bc, - const vector &iter) { - if (contains(bc.iterCache, iter)) { - DEBUG_PRINTF("cache hit\n"); - u32 offset = bc.iterCache.at(iter); - return offset; - } - - u32 offset = add_to_engine_blob(bc, iter.begin(), iter.end()); - - bc.iterCache.insert(make_pair(iter, offset)); - - return offset; -} - static u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { vector lb_roles; @@ -2495,7 +2212,7 @@ u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { vector iter; mmbBuildSparseIterator(iter, lb_roles, bc.numStates); - return addIteratorToTable(bc, iter); + return bc.engine_blob.add_iterator(iter); } static @@ -2536,12 +2253,12 @@ u32 findMinFloatingLiteralMatch(const RoseBuildImpl &build, u32 minWidth = ROSE_BOUND_INF; for (auto v : vertices_range(g)) { if (build.isAnchored(v) || build.isVirtualVertex(v)) { - DEBUG_PRINTF("skipping %zu anchored or root\n", g[v].idx); + DEBUG_PRINTF("skipping %zu anchored or root\n", g[v].index); continue; } u32 w = g[v].min_offset; - DEBUG_PRINTF("%zu m_o = %u\n", g[v].idx, w); + DEBUG_PRINTF("%zu m_o = %u\n", g[v].index, w); if (w < minWidth) { minWidth = w; @@ -2582,7 +2299,7 @@ void buildSuffixEkeyLists(const RoseBuildImpl &tbi, build_context &bc, for (auto &e : qi_to_ekeys) { assert(!e.second.empty()); e.second.push_back(INVALID_EKEY); /* terminator */ - (*out)[e.first] = add_to_engine_blob(bc, e.second.begin(), + (*out)[e.first] = bc.engine_blob.add(e.second.begin(), e.second.end()); } } @@ -2607,7 +2324,7 @@ u32 buildEodNfaIterator(build_context &bc, const u32 activeQueueCount) { vector iter; mmbBuildSparseIterator(iter, keys, activeQueueCount); - return addIteratorToTable(bc, iter); + return bc.engine_blob.add_iterator(iter); } static @@ -2770,129 +2487,8 @@ getLiteralInfoByFinalId(const RoseBuildImpl &build, u32 final_id) { return out; } -/** - * \brief Flattens a list of role programs into one finalised program with its - * fail_jump/done_jump targets set correctly. - */ static -vector -flattenProgram(const vector> &programs) { - vector out; - - vector offsets; // offset of each instruction (bytes) - vector blocks; // track which block we're in - vector block_offsets; // start offsets for each block - - DEBUG_PRINTF("%zu program blocks\n", programs.size()); - - size_t curr_offset = 0; - for (const auto &program : programs) { - DEBUG_PRINTF("block with %zu instructions\n", program.size()); - block_offsets.push_back(curr_offset); - for (const auto &ri : program) { - assert(ri.code() != ROSE_INSTR_END); - out.push_back(ri); - offsets.push_back(curr_offset); - blocks.push_back(block_offsets.size() - 1); - curr_offset += ROUNDUP_N(ri.length(), ROSE_INSTR_MIN_ALIGN); - } - } - - // Add a final END instruction, which is its own block. - out.emplace_back(ROSE_INSTR_END); - block_offsets.push_back(curr_offset); - offsets.push_back(curr_offset); - - assert(offsets.size() == out.size()); - - for (size_t i = 0; i < out.size(); i++) { - auto &ri = out[i]; - - u32 jump_target = 0; - switch (ri.target) { - case JumpTarget::NO_JUMP: - case JumpTarget::FIXUP_DONE: - continue; // Next instruction. - case JumpTarget::PROGRAM_END: - assert(i != out.size() - 1); - jump_target = offsets.back(); - break; - case JumpTarget::NEXT_BLOCK: - assert(blocks[i] + 1 < block_offsets.size()); - jump_target = block_offsets[blocks[i] + 1]; - break; - } - - // We currently always make progress and never jump backwards. 
- assert(jump_target > offsets[i]); - assert(jump_target <= offsets.back()); - u32 jump_val = jump_target - offsets[i]; - - switch (ri.code()) { - case ROSE_INSTR_ANCHORED_DELAY: - ri.u.anchoredDelay.done_jump = jump_val; - break; - case ROSE_INSTR_CHECK_ONLY_EOD: - ri.u.checkOnlyEod.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_BOUNDS: - ri.u.checkBounds.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_NOT_HANDLED: - ri.u.checkNotHandled.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_LOOKAROUND: - ri.u.checkLookaround.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_MASK: - ri.u.checkMask.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_BYTE: - ri.u.checkByte.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_INFIX: - ri.u.checkInfix.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_PREFIX: - ri.u.checkPrefix.fail_jump = jump_val; - break; - case ROSE_INSTR_DEDUPE: - ri.u.dedupe.fail_jump = jump_val; - break; - case ROSE_INSTR_DEDUPE_SOM: - ri.u.dedupeSom.fail_jump = jump_val; - break; - case ROSE_INSTR_DEDUPE_AND_REPORT: - ri.u.dedupeAndReport.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_EXHAUSTED: - ri.u.checkExhausted.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_MIN_LENGTH: - ri.u.checkMinLength.fail_jump = jump_val; - break; - case ROSE_INSTR_CHECK_STATE: - ri.u.checkState.fail_jump = jump_val; - break; - case ROSE_INSTR_SPARSE_ITER_BEGIN: - ri.u.sparseIterBegin.fail_jump = jump_val; - break; - case ROSE_INSTR_SPARSE_ITER_NEXT: - ri.u.sparseIterNext.fail_jump = jump_val; - break; - default: - assert(0); // Unhandled opcode? - break; - } - - ri.target = JumpTarget::FIXUP_DONE; - } - - return out; -} - -static -void applyFinalSpecialisation(vector &program) { +void applyFinalSpecialisation(RoseProgram &program) { assert(!program.empty()); assert(program.back().code() == ROSE_INSTR_END); if (program.size() < 2) { @@ -2901,26 +2497,18 @@ void applyFinalSpecialisation(vector &program) { /* Replace the second-to-last instruction (before END) with a one-shot * specialisation if available. 
*/ - auto &ri = *(next(program.rbegin())); - switch (ri.code()) { - case ROSE_INSTR_REPORT: { + auto it = next(program.rbegin()); + if (auto *ri = dynamic_cast(it->get())) { DEBUG_PRINTF("replacing REPORT with FINAL_REPORT\n"); - auto ri2 = RoseInstruction(ROSE_INSTR_FINAL_REPORT); - ri2.u.finalReport.onmatch = ri.u.report.onmatch; - ri2.u.finalReport.offset_adjust = ri.u.report.offset_adjust; - ri = ri2; - break; - } - default: - break; + program.replace(it, make_unique( + ri->onmatch, ri->offset_adjust)); } } static -void recordResources(RoseResources &resources, - const vector &program) { +void recordResources(RoseResources &resources, const RoseProgram &program) { for (const auto &ri : program) { - switch (ri.code()) { + switch (ri->code()) { case ROSE_INSTR_TRIGGER_SUFFIX: resources.has_suffixes = true; break; @@ -2942,8 +2530,9 @@ void recordResources(RoseResources &resources, case ROSE_INSTR_PUSH_DELAYED: resources.has_lit_delay = true; break; - case ROSE_INSTR_CHECK_LIT_MASK: - resources.has_lit_mask = true; + case ROSE_INSTR_CHECK_LONG_LIT: + case ROSE_INSTR_CHECK_LONG_LIT_NOCASE: + resources.has_lit_check = true; break; default: break; @@ -2978,22 +2567,31 @@ void recordResources(RoseResources &resources, } static -u32 writeProgram(build_context &bc, const vector &program) { +void recordLongLiterals(build_context &bc, const RoseProgram &program) { + for (const auto &ri : program) { + if (const auto *ri_check = + dynamic_cast(ri.get())) { + DEBUG_PRINTF("found CHECK_LITERAL for string '%s'\n", + escapeString(ri_check->literal).c_str()); + bc.longLiterals.emplace_back(ri_check->literal, false); + continue; + } + if (const auto *ri_check = + dynamic_cast(ri.get())) { + DEBUG_PRINTF("found CHECK_LITERAL_NOCASE for string '%s'\n", + escapeString(ri_check->literal).c_str()); + bc.longLiterals.emplace_back(ri_check->literal, true); + } + } +} + +static +u32 writeProgram(build_context &bc, RoseProgram &&program) { if (program.empty()) { DEBUG_PRINTF("no program\n"); return 0; } - assert(program.back().code() == ROSE_INSTR_END); - assert(program.size() >= 1); - - // This program must have been flattened; i.e. all check instructions must - // have their jump offsets set. 
- assert(all_of(begin(program), end(program), [](const RoseInstruction &ri) { - return ri.target == JumpTarget::NO_JUMP || - ri.target == JumpTarget::FIXUP_DONE; - })); - auto it = bc.program_cache.find(program); if (it != end(bc.program_cache)) { DEBUG_PRINTF("reusing cached program at %u\n", it->second); @@ -3001,21 +2599,15 @@ u32 writeProgram(build_context &bc, const vector &program) { } recordResources(bc.resources, program); + recordLongLiterals(bc, program); - DEBUG_PRINTF("writing %zu instructions\n", program.size()); - u32 programOffset = 0; - for (const auto &ri : program) { - u32 offset = - add_to_engine_blob(bc, ri.get(), ri.length(), ROSE_INSTR_MIN_ALIGN); - DEBUG_PRINTF("code %u len %zu written at offset %u\n", ri.code(), - ri.length(), offset); - if (!programOffset) { - programOffset = offset; - } - } - DEBUG_PRINTF("program begins at offset %u\n", programOffset); - bc.program_cache.emplace(program, programOffset); - return programOffset; + u32 len = 0; + auto prog_bytecode = writeProgram(bc.engine_blob, program, &len); + u32 offset = bc.engine_blob.add(prog_bytecode.get(), len, + ROSE_INSTR_MIN_ALIGN); + DEBUG_PRINTF("prog len %u written at offset %u\n", len, offset); + bc.program_cache.emplace(move(program), offset); + return offset; } static @@ -3233,8 +2825,7 @@ bool checkReachWithFlip(const CharReach &cr, u8 &andmask, } static -bool makeRoleByte(const vector &look, - vector &program) { +bool makeRoleByte(const vector &look, RoseProgram &program) { if (look.size() == 1) { const auto &entry = look[0]; u8 andmask_u8, cmpmask_u8; @@ -3244,21 +2835,17 @@ bool makeRoleByte(const vector &look, } s32 checkbyte_offset = verify_s32(entry.offset); DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset); - auto ri = RoseInstruction(ROSE_INSTR_CHECK_BYTE, - JumpTarget::NEXT_BLOCK); - ri.u.checkByte.and_mask = andmask_u8; - ri.u.checkByte.cmp_mask = cmpmask_u8; - ri.u.checkByte.negation = flip; - ri.u.checkByte.offset = checkbyte_offset; - program.push_back(ri); + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(andmask_u8, cmpmask_u8, flip, + checkbyte_offset, end_inst); + program.add_before_end(move(ri)); return true; } return false; } static -bool makeRoleMask(const vector &look, - vector &program) { +bool makeRoleMask(const vector &look, RoseProgram &program) { if (look.back().offset < look.front().offset + 8) { s32 base_offset = verify_s32(look.front().offset); u64a and_mask = 0; @@ -3280,21 +2867,287 @@ bool makeRoleMask(const vector &look, } DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n", and_mask, cmp_mask); - auto ri = RoseInstruction(ROSE_INSTR_CHECK_MASK, - JumpTarget::NEXT_BLOCK); - ri.u.checkMask.and_mask = and_mask; - ri.u.checkMask.cmp_mask = cmp_mask; - ri.u.checkMask.neg_mask = neg_mask; - ri.u.checkMask.offset = base_offset; - program.push_back(ri); + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(and_mask, cmp_mask, neg_mask, + base_offset, end_inst); + program.add_before_end(move(ri)); return true; } return false; } +static UNUSED +string convertMaskstoString(u8 *p, int byte_len) { + string s; + for (int i = 0; i < byte_len; i++) { + u8 hi = *p >> 4; + u8 lo = *p & 0xf; + s += (char)(hi + (hi < 10 ? 48 : 87)); + s += (char)(lo + (lo < 10 ? 
48 : 87));
+        p++;
+    }
+    return s;
+}
+
+static
+bool makeRoleMask32(const vector<LookEntry> &look,
+                    RoseProgram &program) {
+    if (look.back().offset >= look.front().offset + 32) {
+        return false;
+    }
+    s32 base_offset = verify_s32(look.front().offset);
+    array<u8, 32> and_mask, cmp_mask;
+    and_mask.fill(0);
+    cmp_mask.fill(0);
+    u32 neg_mask = 0;
+    for (const auto &entry : look) {
+        u8 andmask_u8, cmpmask_u8, flip;
+        if (!checkReachWithFlip(entry.reach, andmask_u8,
+                                cmpmask_u8, flip)) {
+            return false;
+        }
+        u32 shift = entry.offset - base_offset;
+        assert(shift < 32);
+        and_mask[shift] = andmask_u8;
+        cmp_mask[shift] = cmpmask_u8;
+        if (flip) {
+            neg_mask |= 1 << shift;
+        }
+    }
+
+    DEBUG_PRINTF("and_mask %s\n",
+                 convertMaskstoString(and_mask.data(), 32).c_str());
+    DEBUG_PRINTF("cmp_mask %s\n",
+                 convertMaskstoString(cmp_mask.data(), 32).c_str());
+    DEBUG_PRINTF("neg_mask %08x\n", neg_mask);
+    DEBUG_PRINTF("base_offset %d\n", base_offset);
+
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrCheckMask32>(and_mask, cmp_mask, neg_mask,
+                                                base_offset, end_inst);
+    program.add_before_end(move(ri));
+    return true;
+}
+
+// Sorting by the size of every bucket.
+// Used in map<u32, vector<s8>, cmpNibble>.
+struct cmpNibble {
+    bool operator()(const u32 data1, const u32 data2) const{
+        u32 size1 = popcount32(data1 >> 16) * popcount32(data1 << 16);
+        u32 size2 = popcount32(data2 >> 16) * popcount32(data2 << 16);
+        return std::tie(size1, data1) < std::tie(size2, data2);
+    }
+};
+
+// Insert all pairs of bucket and offset into buckets.
+static really_inline
+void getAllBuckets(const vector<LookEntry> &look,
+                   map<u32, vector<s8>, cmpNibble> &buckets, u32 &neg_mask) {
+    s32 base_offset = verify_s32(look.front().offset);
+    for (const auto &entry : look) {
+        CharReach cr = entry.reach;
+        // Flip heavy character classes to save buckets.
+        if (cr.count() > 128 ) {
+            cr.flip();
+        } else {
+            neg_mask ^= 1 << (entry.offset - base_offset);
+        }
+        map<u16, u16> lo2hi;
+        // We treat Ascii Table as a 16x16 grid.
+        // Push every row in cr into lo2hi and mark the row number.
+        for (size_t i = cr.find_first(); i != CharReach::npos;) {
+            u8 it_hi = i >> 4;
+            u16 low_encode = 0;
+            while (i != CharReach::npos && (i >> 4) == it_hi) {
+                low_encode |= 1 << (i & 0xf);
+                i = cr.find_next(i);
+            }
+            lo2hi[low_encode] |= 1 << it_hi;
+        }
+        for (const auto &it : lo2hi) {
+            u32 hi_lo = (it.second << 16) | it.first;
+            buckets[hi_lo].push_back(entry.offset);
+        }
+    }
+}
+
+// Once we have a new bucket, we'll try to combine it with all old buckets.
+static really_inline
+void nibUpdate(map<u32, u16> &nib, u32 hi_lo) {
+    u16 hi = hi_lo >> 16;
+    u16 lo = hi_lo & 0xffff;
+    for (const auto pairs : nib) {
+        u32 old = pairs.first;
+        if ((old >> 16) == hi || (old & 0xffff) == lo) {
+            if (!nib[old | hi_lo]) {
+                nib[old | hi_lo] = nib[old] | nib[hi_lo];
+            }
+        }
+    }
+}
+
+static really_inline
+void nibMaskUpdate(array<u8, 32> &mask, u32 data, u8 bit_index) {
+    for (u8 index = 0; data > 0; data >>= 1, index++) {
+        if (data & 1) {
+            // 0 ~ 7 bucket in first 16 bytes,
+            // 8 ~ 15 bucket in second 16 bytes.
+            if (bit_index >= 8) {
+                mask[index + 16] |= 1 << (bit_index - 8);
+            } else {
+                mask[index] |= 1 << bit_index;
+            }
+        }
+    }
+}
+
+static
+bool makeRoleShufti(const vector<LookEntry> &look,
+                    RoseProgram &program) {
+
+    s32 base_offset = verify_s32(look.front().offset);
+    if (look.back().offset >= base_offset + 32) {
+        return false;
+    }
+    array<u8, 32> hi_mask, lo_mask;
+    hi_mask.fill(0);
+    lo_mask.fill(0);
+    array<u8, 32> bucket_select_hi, bucket_select_lo;
+    bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
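    // (Editorial worked example, not part of the patch: getAllBuckets()
    // above encodes each reach as hi_lo = (high-nibble rows << 16) |
    // low-nibble pattern. For the reach {'A','a'} = {0x41, 0x61}, both
    // bytes share the low-nibble pattern 1 << 1 = 0x0002, seen from high
    // nibbles 4 and 6, i.e. (1 << 4) | (1 << 6) = 0x0050, so the class
    // encodes as hi_lo = 0x00500002 and consumes a single shufti bucket.)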
+    bucket_select_lo.fill(0);
+    u8 bit_index = 0; // number of buckets
+    map<u32, u16> nib; // map every bucket to its bucket number.
+    map<u32, vector<s8>, cmpNibble> bucket2offsets;
+    u32 neg_mask = ~0u;
+
+    getAllBuckets(look, bucket2offsets, neg_mask);
+
+    for (const auto &it : bucket2offsets) {
+        u32 hi_lo = it.first;
+        // New bucket.
+        if (!nib[hi_lo]) {
+            if (bit_index >= 16) {
+                return false;
+            }
+            nib[hi_lo] = 1 << bit_index;
+
+            nibUpdate(nib, hi_lo);
+            nibMaskUpdate(hi_mask, hi_lo >> 16, bit_index);
+            nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_index);
+            bit_index++;
+        }
+
+        DEBUG_PRINTF("hi_lo %x bucket %x\n", hi_lo, nib[hi_lo]);
+
+        // Update bucket_select_mask.
+        u8 nib_hi = nib[hi_lo] >> 8;
+        u8 nib_lo = nib[hi_lo] & 0xff;
+        for (const auto offset : it.second) {
+            bucket_select_hi[offset - base_offset] |= nib_hi;
+            bucket_select_lo[offset - base_offset] |= nib_lo;
+        }
+    }
+
+    DEBUG_PRINTF("hi_mask %s\n",
+                 convertMaskstoString(hi_mask.data(), 32).c_str());
+    DEBUG_PRINTF("lo_mask %s\n",
+                 convertMaskstoString(lo_mask.data(), 32).c_str());
+    DEBUG_PRINTF("bucket_select_hi %s\n",
+                 convertMaskstoString(bucket_select_hi.data(), 32).c_str());
+    DEBUG_PRINTF("bucket_select_lo %s\n",
+                 convertMaskstoString(bucket_select_lo.data(), 32).c_str());
+
+    const auto *end_inst = program.end_instruction();
+    if (bit_index < 8) {
+        if (look.back().offset < base_offset + 16) {
+            neg_mask &= 0xffff;
+            array<u8, 32> nib_mask;
+            array<u8, 16> bucket_select_mask_16;
+            copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin());
+            copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16);
+            copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16,
+                 bucket_select_mask_16.begin());
+            auto ri = make_unique<RoseInstrCheckShufti16x8>
+                      (nib_mask, bucket_select_mask_16,
+                       neg_mask, base_offset, end_inst);
+            program.add_before_end(move(ri));
+        } else {
+            array<u8, 16> hi_mask_16;
+            array<u8, 16> lo_mask_16;
+            copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin());
+            copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin());
+            auto ri = make_unique<RoseInstrCheckShufti32x8>
+                      (hi_mask_16, lo_mask_16, bucket_select_lo,
+                       neg_mask, base_offset, end_inst);
+            program.add_before_end(move(ri));
+        }
+    } else {
+        if (look.back().offset < base_offset + 16) {
+            neg_mask &= 0xffff;
+            array<u8, 32> bucket_select_mask_32;
+            copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16,
+                 bucket_select_mask_32.begin());
+            copy(bucket_select_hi.begin(), bucket_select_hi.begin() + 16,
+                 bucket_select_mask_32.begin() + 16);
+            auto ri = make_unique<RoseInstrCheckShufti16x16>
+                      (hi_mask, lo_mask, bucket_select_mask_32,
+                       neg_mask, base_offset, end_inst);
+            program.add_before_end(move(ri));
+        } else {
+            auto ri = make_unique<RoseInstrCheckShufti32x16>
+                      (hi_mask, lo_mask, bucket_select_hi, bucket_select_lo,
+                       neg_mask, base_offset, end_inst);
+            program.add_before_end(move(ri));
+        }
+    }
+    return true;
+}
+
+/**
+ * Builds a lookaround instruction, or an appropriate specialization if one is
+ * available.
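 * (Editorial note: the helpers above are tried cheapest-first in the body
 * below: a single-byte check, a one-entry lookaround, an 8-byte CHECK_MASK,
 * a 32-byte CHECK_MASK_32, then the shufti variants, with the generic
 * CHECK_LOOKAROUND instruction as the fallback.)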
+ */ +static +void makeLookaroundInstruction(build_context &bc, const vector &look, + RoseProgram &program) { + assert(!look.empty()); + + if (makeRoleByte(look, program)) { + return; + } + + if (look.size() == 1) { + s8 offset = look.begin()->offset; + u32 look_idx = addLookaround(bc, look); + auto ri = make_unique(offset, look_idx, + program.end_instruction()); + program.add_before_end(move(ri)); + return; + } + + if (makeRoleMask(look, program)) { + return; + } + + if (makeRoleMask32(look, program)) { + return; + } + + if (makeRoleShufti(look, program)) { + return; + } + + u32 look_idx = addLookaround(bc, look); + u32 look_count = verify_u32(look.size()); + + auto ri = make_unique(look_idx, look_count, + program.end_instruction()); + program.add_before_end(move(ri)); +} + static void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, - vector &program) { + RoseProgram &program) { if (!build.cc.grey.roseLookaroundMasks) { return; } @@ -3317,28 +3170,12 @@ void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, return; } - if (makeRoleByte(look, program)) { - return; - } - - if (makeRoleMask(look, program)) { - return; - } - - DEBUG_PRINTF("role has lookaround\n"); - u32 look_idx = addLookaround(bc, look); - u32 look_count = verify_u32(look.size()); - - auto ri = RoseInstruction(ROSE_INSTR_CHECK_LOOKAROUND, - JumpTarget::NEXT_BLOCK); - ri.u.checkLookaround.index = look_idx; - ri.u.checkLookaround.count = look_count; - program.push_back(ri); + makeLookaroundInstruction(bc, look, program); } static void makeRoleCheckLeftfix(RoseBuildImpl &build, build_context &bc, RoseVertex v, - vector &program) { + RoseProgram &program) { auto it = bc.leftfix_info.find(v); if (it == end(bc.leftfix_info)) { return; @@ -3352,26 +3189,24 @@ void makeRoleCheckLeftfix(RoseBuildImpl &build, build_context &bc, RoseVertex v, build.g[v].left.lag <= MAX_STORED_LEFTFIX_LAG); bool is_prefix = build.isRootSuccessor(v); + const auto *end_inst = program.end_instruction(); + + unique_ptr ri; if (is_prefix) { - auto ri = - RoseInstruction(ROSE_INSTR_CHECK_PREFIX, JumpTarget::NEXT_BLOCK); - ri.u.checkPrefix.queue = lni.queue; - ri.u.checkPrefix.lag = build.g[v].left.lag; - ri.u.checkPrefix.report = build.g[v].left.leftfix_report; - program.push_back(move(ri)); + ri = make_unique(lni.queue, build.g[v].left.lag, + build.g[v].left.leftfix_report, + end_inst); } else { - auto ri = - RoseInstruction(ROSE_INSTR_CHECK_INFIX, JumpTarget::NEXT_BLOCK); - ri.u.checkInfix.queue = lni.queue; - ri.u.checkInfix.lag = build.g[v].left.lag; - ri.u.checkInfix.report = build.g[v].left.leftfix_report; - program.push_back(move(ri)); + ri = make_unique(lni.queue, build.g[v].left.lag, + build.g[v].left.leftfix_report, + end_inst); } + program.add_before_end(move(ri)); } static void makeRoleAnchoredDelay(RoseBuildImpl &build, build_context &bc, - RoseVertex v, vector &program) { + RoseVertex v, RoseProgram &program) { // Only relevant for roles that can be triggered by the anchored table. 
if (!build.isAnchored(v)) { return; @@ -3383,36 +3218,34 @@ void makeRoleAnchoredDelay(RoseBuildImpl &build, build_context &bc, return; } - auto ri = RoseInstruction(ROSE_INSTR_ANCHORED_DELAY, - JumpTarget::NEXT_BLOCK); - ri.u.anchoredDelay.groups = build.g[v].groups; - program.push_back(ri); + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(build.g[v].groups, end_inst); + program.add_before_end(move(ri)); } static void makeDedupe(const RoseBuildImpl &build, const Report &report, - vector &report_block) { - auto ri = RoseInstruction(ROSE_INSTR_DEDUPE, JumpTarget::NEXT_BLOCK); - ri.u.dedupe.quash_som = report.quashSom; - ri.u.dedupe.dkey = build.rm.getDkey(report); - ri.u.dedupe.offset_adjust = report.offsetAdjust; - report_block.push_back(move(ri)); + RoseProgram &program) { + const auto *end_inst = program.end_instruction(); + auto ri = + make_unique(report.quashSom, build.rm.getDkey(report), + report.offsetAdjust, end_inst); + program.add_before_end(move(ri)); } static void makeDedupeSom(const RoseBuildImpl &build, const Report &report, - vector &report_block) { - auto ri = RoseInstruction(ROSE_INSTR_DEDUPE_SOM, JumpTarget::NEXT_BLOCK); - ri.u.dedupeSom.quash_som = report.quashSom; - ri.u.dedupeSom.dkey = build.rm.getDkey(report); - ri.u.dedupeSom.offset_adjust = report.offsetAdjust; - report_block.push_back(move(ri)); + RoseProgram &program) { + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(report.quashSom, + build.rm.getDkey(report), + report.offsetAdjust, end_inst); + program.add_before_end(move(ri)); } static void makeCatchup(RoseBuildImpl &build, build_context &bc, - const flat_set &reports, - vector &program) { + const flat_set &reports, RoseProgram &program) { if (!bc.needs_catchup) { return; } @@ -3430,12 +3263,12 @@ void makeCatchup(RoseBuildImpl &build, build_context &bc, return; } - program.emplace_back(ROSE_INSTR_CATCH_UP); + program.add_before_end(make_unique()); } static void makeCatchupMpv(RoseBuildImpl &build, build_context &bc, ReportID id, - vector &program) { + RoseProgram &program) { if (!bc.needs_mpv_catchup) { return; } @@ -3445,13 +3278,15 @@ void makeCatchupMpv(RoseBuildImpl &build, build_context &bc, ReportID id, return; } - program.emplace_back(ROSE_INSTR_CATCH_UP_MPV); + program.add_before_end(make_unique()); } static void writeSomOperation(const Report &report, som_operation *op) { assert(op); + memset(op, 0, sizeof(*op)); + switch (report.type) { case EXTERNAL_CALLBACK_SOM_REL: op->type = SOM_EXTERNAL_CALLBACK_REL; @@ -3521,51 +3356,46 @@ void writeSomOperation(const Report &report, som_operation *op) { static void makeReport(RoseBuildImpl &build, const ReportID id, - const bool has_som, vector &program) { + const bool has_som, RoseProgram &program) { assert(id < build.rm.numReports()); const Report &report = build.rm.getReport(id); - vector report_block; + RoseProgram report_block; + const RoseInstruction *end_inst = report_block.end_instruction(); // Handle min/max offset checks. if (report.minOffset > 0 || report.maxOffset < MAX_OFFSET) { - auto ri = RoseInstruction(ROSE_INSTR_CHECK_BOUNDS, - JumpTarget::NEXT_BLOCK); - ri.u.checkBounds.min_bound = report.minOffset; - ri.u.checkBounds.max_bound = report.maxOffset; - report_block.push_back(move(ri)); + auto ri = make_unique(report.minOffset, + report.maxOffset, end_inst); + report_block.add_before_end(move(ri)); } // If this report has an exhaustion key, we can check it in the program // rather than waiting until we're in the callback adaptor. 
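    // (Editorial sketch: the CHECK_EXHAUSTED instruction added below boils
    // down to a bit-test on the exhaustion vector held in stream state,
    // roughly:
    //
    //     if (evec[ekey / 8] & (1U << (ekey % 8))) {
    //         // report already exhausted: jump to the end of this block
    //     }
    //
    // The exact vector layout belongs to the runtime; the point is that the
    // test is hoisted out of the match-callback adaptor into the program.)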
if (report.ekey != INVALID_EKEY) { - auto ri = RoseInstruction(ROSE_INSTR_CHECK_EXHAUSTED, - JumpTarget::NEXT_BLOCK); - ri.u.checkExhausted.ekey = report.ekey; - report_block.push_back(move(ri)); + auto ri = make_unique(report.ekey, end_inst); + report_block.add_before_end(move(ri)); } // External SOM reports that aren't passthrough need their SOM value // calculated. if (isExternalSomReport(report) && report.type != EXTERNAL_CALLBACK_SOM_PASS) { - auto ri = RoseInstruction(ROSE_INSTR_SOM_FROM_REPORT); - writeSomOperation(report, &ri.u.somFromReport.som); - report_block.push_back(move(ri)); + auto ri = make_unique(); + writeSomOperation(report, &ri->som); + report_block.add_before_end(move(ri)); } // Min length constraint. if (report.minLength > 0) { assert(build.hasSom); - auto ri = RoseInstruction(ROSE_INSTR_CHECK_MIN_LENGTH, - JumpTarget::NEXT_BLOCK); - ri.u.checkMinLength.end_adj = report.offsetAdjust; - ri.u.checkMinLength.min_length = report.minLength; - report_block.push_back(move(ri)); + auto ri = make_unique( + report.offsetAdjust, report.minLength, end_inst); + report_block.add_before_end(move(ri)); } if (report.quashSom) { - report_block.emplace_back(ROSE_INSTR_SOM_ZERO); + report_block.add_before_end(make_unique()); } switch (report.type) { @@ -3576,42 +3406,30 @@ void makeReport(RoseBuildImpl &build, const ReportID id, bool needs_dedupe = build.rm.getDkey(report) != ~0U || build.hasSom; if (report.ekey == INVALID_EKEY) { if (needs_dedupe) { - report_block.emplace_back(ROSE_INSTR_DEDUPE_AND_REPORT, - JumpTarget::NEXT_BLOCK); - auto &ri = report_block.back(); - ri.u.dedupeAndReport.quash_som = report.quashSom; - ri.u.dedupeAndReport.dkey = build.rm.getDkey(report); - ri.u.dedupeAndReport.onmatch = report.onmatch; - ri.u.dedupeAndReport.offset_adjust = report.offsetAdjust; + report_block.add_before_end( + make_unique( + report.quashSom, build.rm.getDkey(report), + report.onmatch, report.offsetAdjust, end_inst)); } else { - report_block.emplace_back(ROSE_INSTR_REPORT); - auto &ri = report_block.back(); - ri.u.report.onmatch = report.onmatch; - ri.u.report.offset_adjust = report.offsetAdjust; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); } } else { if (needs_dedupe) { makeDedupe(build, report, report_block); } - report_block.emplace_back(ROSE_INSTR_REPORT_EXHAUST); - auto &ri = report_block.back(); - ri.u.reportExhaust.onmatch = report.onmatch; - ri.u.reportExhaust.offset_adjust = report.offsetAdjust; - ri.u.reportExhaust.ekey = report.ekey; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); } } else { // has_som makeDedupeSom(build, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM); - auto &ri = report_block.back(); - ri.u.reportSom.onmatch = report.onmatch; - ri.u.reportSom.offset_adjust = report.offsetAdjust; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); } else { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM_EXHAUST); - auto &ri = report_block.back(); - ri.u.reportSomExhaust.onmatch = report.onmatch; - ri.u.reportSomExhaust.offset_adjust = report.offsetAdjust; - ri.u.reportSomExhaust.ekey = report.ekey; + report_block.add_before_end( + make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); } } break; @@ -3627,20 +3445,18 @@ void makeReport(RoseBuildImpl &build, const ReportID id, case INTERNAL_SOM_LOC_SET_FROM: case INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE: if (has_som) { - 
report_block.emplace_back(ROSE_INSTR_REPORT_SOM_AWARE); - auto &ri = report_block.back(); - writeSomOperation(report, &ri.u.reportSomAware.som); + auto ri = make_unique(); + writeSomOperation(report, &ri->som); + report_block.add_before_end(move(ri)); } else { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM_INT); - auto &ri = report_block.back(); - writeSomOperation(report, &ri.u.reportSomInt.som); + auto ri = make_unique(); + writeSomOperation(report, &ri->som); + report_block.add_before_end(move(ri)); } break; case INTERNAL_ROSE_CHAIN: { - report_block.emplace_back(ROSE_INSTR_REPORT_CHAIN); - auto &ri = report_block.back(); - ri.u.reportChain.event = report.onmatch; - ri.u.reportChain.top_squash_distance = report.topSquashDistance; + report_block.add_before_end(make_unique( + report.onmatch, report.topSquashDistance)); break; } case EXTERNAL_CALLBACK_SOM_REL: @@ -3649,31 +3465,21 @@ void makeReport(RoseBuildImpl &build, const ReportID id, case EXTERNAL_CALLBACK_SOM_REV_NFA: makeDedupeSom(build, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM); - auto &ri = report_block.back(); - ri.u.reportSom.onmatch = report.onmatch; - ri.u.reportSom.offset_adjust = report.offsetAdjust; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); } else { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM_EXHAUST); - auto &ri = report_block.back(); - ri.u.reportSomExhaust.onmatch = report.onmatch; - ri.u.reportSomExhaust.offset_adjust = report.offsetAdjust; - ri.u.reportSomExhaust.ekey = report.ekey; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); } break; case EXTERNAL_CALLBACK_SOM_PASS: makeDedupeSom(build, report, report_block); if (report.ekey == INVALID_EKEY) { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM); - auto &ri = report_block.back(); - ri.u.reportSom.onmatch = report.onmatch; - ri.u.reportSom.offset_adjust = report.offsetAdjust; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust)); } else { - report_block.emplace_back(ROSE_INSTR_REPORT_SOM_EXHAUST); - auto &ri = report_block.back(); - ri.u.reportSomExhaust.onmatch = report.onmatch; - ri.u.reportSomExhaust.offset_adjust = report.offsetAdjust; - ri.u.reportSomExhaust.ekey = report.ekey; + report_block.add_before_end(make_unique( + report.onmatch, report.offsetAdjust, report.ekey)); } break; @@ -3683,15 +3489,12 @@ void makeReport(RoseBuildImpl &build, const ReportID id, } assert(!report_block.empty()); - report_block = flattenProgram({report_block}); - assert(report_block.back().code() == ROSE_INSTR_END); - report_block.pop_back(); - insert(&program, program.end(), report_block); + program.add_block(move(report_block)); } static void makeRoleReports(RoseBuildImpl &build, build_context &bc, RoseVertex v, - vector &program) { + RoseProgram &program) { const auto &g = build.g; /* we are a suffaig - need to update role to provide som to the @@ -3700,29 +3503,28 @@ void makeRoleReports(RoseBuildImpl &build, build_context &bc, RoseVertex v, if (g[v].left.tracksSom()) { assert(contains(bc.leftfix_info, v)); const left_build_info &lni = bc.leftfix_info.at(v); - auto ri = RoseInstruction(ROSE_INSTR_SOM_LEFTFIX); - ri.u.somLeftfix.queue = lni.queue; - ri.u.somLeftfix.lag = g[v].left.lag; - program.push_back(ri); + program.add_before_end( + make_unique(lni.queue, g[v].left.lag)); has_som = true; } else if (g[v].som_adjust) { - auto ri = RoseInstruction(ROSE_INSTR_SOM_ADJUST); - 
ri.u.somAdjust.distance = g[v].som_adjust; - program.push_back(ri); + program.add_before_end( + make_unique(g[v].som_adjust)); has_som = true; } const auto &reports = g[v].reports; makeCatchup(build, bc, reports, program); + RoseProgram report_block; for (ReportID id : reports) { - makeReport(build, id, has_som, program); + makeReport(build, id, has_som, report_block); } + program.add_before_end(move(report_block)); } static void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, - vector &program) { + RoseProgram &program) { const auto &g = build.g; if (!g[v].suffix) { return; @@ -3736,7 +3538,7 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, auto tamaProto = g[v].suffix.tamarama.get(); assert(tamaProto); u32 top = (u32)MQE_TOP_FIRST + - tamaProto->top_remap.at(make_pair(g[v].idx, + tamaProto->top_remap.at(make_pair(g[v].index, g[v].suffix.top)); assert(top < MQE_INVALID); suffixEvent = top; @@ -3751,15 +3553,13 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, assert(!g[v].suffix.graph || onlyOneTop(*g[v].suffix.graph)); suffixEvent = MQE_TOP; } - auto ri = RoseInstruction(ROSE_INSTR_TRIGGER_SUFFIX); - ri.u.triggerSuffix.queue = qi; - ri.u.triggerSuffix.event = suffixEvent; - program.push_back(ri); + program.add_before_end( + make_unique(qi, suffixEvent)); } static void makeRoleGroups(RoseBuildImpl &build, build_context &bc, RoseVertex v, - vector &program) { + RoseProgram &program) { const auto &g = build.g; rose_group groups = g[v].groups; if (!groups) { @@ -3790,17 +3590,15 @@ void makeRoleGroups(RoseBuildImpl &build, build_context &bc, RoseVertex v, return; } - auto ri = RoseInstruction(ROSE_INSTR_SET_GROUPS); - ri.u.setGroups.groups = groups; - program.push_back(ri); + program.add_before_end(make_unique(groups)); } static void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, - RoseVertex u, vector &program) { + RoseVertex u, RoseProgram &program) { const auto &g = build.g; - vector infix_program; + vector infix_program; for (const auto &e : out_edges_range(u, g)) { RoseVertex v = target(e, g); @@ -3822,7 +3620,7 @@ void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, auto tamaProto = g[v].left.tamarama.get(); assert(tamaProto); top = MQE_TOP_FIRST + tamaProto->top_remap.at( - make_pair(g[v].idx, g[e].rose_top)); + make_pair(g[v].index, g[e].rose_top)); assert(top < MQE_INVALID); } else if (!isMultiTopType(nfa->type)) { assert(num_tops(g[v].left) == 1); @@ -3832,11 +3630,7 @@ void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, assert(top < MQE_INVALID); } - auto ri = RoseInstruction(ROSE_INSTR_TRIGGER_INFIX); - ri.u.triggerInfix.queue = lbi.queue; - ri.u.triggerInfix.event = top; - ri.u.triggerInfix.cancel = g[e].rose_cancel_prev_top; - infix_program.push_back(ri); + infix_program.emplace_back(g[e].rose_cancel_prev_top, lbi.queue, top); } if (infix_program.empty()) { @@ -3844,30 +3638,33 @@ void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, } // Order, de-dupe and add instructions to the end of program. 
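    // (Editorial note: with the POD instruction union gone, ordering must be
    // spelled out explicitly -- std::tie below builds a lexicographic
    // (cancel, queue, event) comparison, and the usual sort/unique/erase
    // idiom then drops duplicate triggers so that the emitted bytecode stays
    // deterministic across compiles.)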
 
     // Order, de-dupe and add instructions to the end of program.
-    sort(begin(infix_program), end(infix_program));
-    unique_copy(begin(infix_program), end(infix_program),
-                back_inserter(program));
+    sort(begin(infix_program), end(infix_program),
+         [](const RoseInstrTriggerInfix &a, const RoseInstrTriggerInfix &b) {
+             return tie(a.cancel, a.queue, a.event) <
+                    tie(b.cancel, b.queue, b.event);
+         });
+    infix_program.erase(unique(begin(infix_program), end(infix_program)),
+                        end(infix_program));
+    for (const auto &ri : infix_program) {
+        program.add_before_end(make_unique<RoseInstrTriggerInfix>(ri));
+    }
 }
 
 static
 void makeRoleSetState(const build_context &bc, RoseVertex v,
-                      vector<RoseInstruction> &program) {
+                      RoseProgram &program) {
     // We only need this instruction if a state index has been assigned to this
     // vertex.
     auto it = bc.roleStateIndices.find(v);
     if (it == end(bc.roleStateIndices)) {
         return;
     }
-
-    u32 idx = it->second;
-    auto ri = RoseInstruction(ROSE_INSTR_SET_STATE);
-    ri.u.setState.index = idx;
-    program.push_back(ri);
+    program.add_before_end(make_unique<RoseInstrSetState>(it->second));
 }
 
 static
 void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v,
-                         const RoseEdge &e, vector<RoseInstruction> &program) {
+                         const RoseEdge &e, RoseProgram &program) {
     const RoseGraph &g = build.g;
     const RoseVertex u = source(e, g);
 
@@ -3908,19 +3705,14 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v,
     // than just {length, inf}.
     assert(min_bound > lit_length || max_bound < MAX_OFFSET);
 
-    auto ri = RoseInstruction(ROSE_INSTR_CHECK_BOUNDS, JumpTarget::NEXT_BLOCK);
-    ri.u.checkBounds.min_bound = min_bound;
-    ri.u.checkBounds.max_bound = max_bound;
-
-    program.push_back(move(ri));
+    const auto *end_inst = program.end_instruction();
+    program.add_before_end(
+        make_unique<RoseInstrCheckBounds>(min_bound, max_bound, end_inst));
 }
 
 static
 void makeRoleCheckNotHandled(build_context &bc, RoseVertex v,
-                             vector<RoseInstruction> &program) {
-    auto ri = RoseInstruction(ROSE_INSTR_CHECK_NOT_HANDLED,
-                              JumpTarget::NEXT_BLOCK);
-
+                             RoseProgram &program) {
     u32 handled_key;
     if (contains(bc.handledKeys, v)) {
         handled_key = bc.handledKeys.at(v);
@@ -3929,19 +3721,21 @@ void makeRoleCheckNotHandled(build_context &bc, RoseVertex v,
         bc.handledKeys.emplace(v, handled_key);
     }
 
-    ri.u.checkNotHandled.key = handled_key;
-
-    program.push_back(move(ri));
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrCheckNotHandled>(handled_key, end_inst);
+    program.add_before_end(move(ri));
 }
 
 static
 void makeRoleEagerEodReports(RoseBuildImpl &build, build_context &bc,
-                             RoseVertex v, vector<RoseInstruction> &program) {
-    vector<RoseInstruction> eod_program;
+                             RoseVertex v, RoseProgram &program) {
+    RoseProgram eod_program;
 
     for (const auto &e : out_edges_range(v, build.g)) {
         if (canEagerlyReportAtEod(build, e)) {
-            makeRoleReports(build, bc, target(e, build.g), eod_program);
+            RoseProgram block;
+            makeRoleReports(build, bc, target(e, build.g), block);
+            eod_program.add_block(move(block));
         }
     }
 
@@ -3952,19 +3746,21 @@
     if (!onlyAtEod(build, v)) {
         // The rest of our program wasn't EOD anchored, so we need to guard
         // these reports with a check.
-        program.emplace_back(ROSE_INSTR_CHECK_ONLY_EOD, JumpTarget::NEXT_BLOCK);
+        const auto *end_inst = eod_program.end_instruction();
+        eod_program.insert(begin(eod_program),
+                           make_unique<RoseInstrCheckOnlyEod>(end_inst));
     }
 
-    program.insert(end(program), begin(eod_program), end(eod_program));
+    program.add_before_end(move(eod_program));
 }
 
 static
-vector<RoseInstruction> makeProgram(RoseBuildImpl &build, build_context &bc,
-                                    const RoseEdge &e) {
+RoseProgram makeProgram(RoseBuildImpl &build, build_context &bc,
+                        const RoseEdge &e) {
     const RoseGraph &g = build.g;
     auto v = target(e, g);
 
-    vector<RoseInstruction> program;
+    RoseProgram program;
 
     // First, add program instructions that enforce preconditions without
     // effects.
 
@@ -3973,8 +3769,8 @@ vector<RoseInstruction> makeProgram(RoseBuildImpl &build, build_context &bc,
     if (onlyAtEod(build, v)) {
         DEBUG_PRINTF("only at eod\n");
-        program.push_back(RoseInstruction(ROSE_INSTR_CHECK_ONLY_EOD,
-                                          JumpTarget::NEXT_BLOCK));
+        const auto *end_inst = program.end_instruction();
+        program.add_before_end(make_unique<RoseInstrCheckOnlyEod>(end_inst));
     }
 
     if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
@@ -3984,31 +3780,48 @@
     // This program may be triggered by different predecessors, with different
     // offset bounds. We must ensure we put this check/set operation after the
     // bounds check to deal with this case.
-    if (hasGreaterInDegree(1, v, g)) {
+    if (in_degree(v, g) > 1) {
         makeRoleCheckNotHandled(bc, v, program);
     }
 
     makeRoleLookaround(build, bc, v, program);
     makeRoleCheckLeftfix(build, bc, v, program);
 
-    // Next, we can add program instructions that have effects.
+    // Next, we can add program instructions that have effects. This must be
+    // done as a series of blocks, as some of them (like reports) are
+    // escapable.
 
-    makeRoleReports(build, bc, v, program);
+    RoseProgram effects_block;
 
-    makeRoleInfixTriggers(build, bc, v, program);
+    RoseProgram reports_block;
+    makeRoleReports(build, bc, v, reports_block);
+    effects_block.add_block(move(reports_block));
+
+    RoseProgram infix_block;
+    makeRoleInfixTriggers(build, bc, v, infix_block);
+    effects_block.add_block(move(infix_block));
 
     // Note: SET_GROUPS instruction must be after infix triggers, as an infix
     // going dead may switch off groups.
-    makeRoleGroups(build, bc, v, program);
+    RoseProgram groups_block;
+    makeRoleGroups(build, bc, v, groups_block);
+    effects_block.add_block(move(groups_block));
 
-    makeRoleSuffix(build, bc, v, program);
+    RoseProgram suffix_block;
+    makeRoleSuffix(build, bc, v, suffix_block);
+    effects_block.add_block(move(suffix_block));
 
-    makeRoleSetState(bc, v, program);
+    RoseProgram state_block;
+    makeRoleSetState(bc, v, state_block);
+    effects_block.add_block(move(state_block));
 
     // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if
    // the program doesn't have one already).
-    makeRoleEagerEodReports(build, bc, v, program);
+    RoseProgram eod_block;
+    makeRoleEagerEodReports(build, bc, v, eod_block);
+    effects_block.add_block(move(eod_block));
+
+    program.add_before_end(move(effects_block));
 
     return program;
 }
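// ---- Editor's aside (not part of the patch): a toy model, under assumed
// semantics, of the control flow makeProgram() above arranges. Conditional
// instructions carry a fail target; the builder points it at the END of the
// enclosing block, so a failed check in (say) the reports block skips only
// that block and the infix/suffix/groups blocks still run. Names here are
// illustrative, not Hyperscan's interpreter.
#include <cstddef>
#include <functional>
#include <vector>

struct ToyInstr {
    std::function<bool()> exec; // returns false when a check fails
    std::size_t fail_target;    // index to resume at on failure; set to
                                // prog.size() to model a jump to END
};

void runToyProgram(const std::vector<ToyInstr> &prog) {
    std::size_t i = 0;
    while (i < prog.size()) {
        i = prog[i].exec() ? i + 1 : prog[i].fail_target;
    }
}
// ---- End editor's aside.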
 
@@ -4024,13 +3837,12 @@ u32 writeBoundaryProgram(RoseBuildImpl &build, build_context &bc,
     // scratch to support it).
     const bool has_som = false;
-    vector<RoseInstruction> program;
+    RoseProgram program;
     for (const auto &id : reports) {
         makeReport(build, id, has_som, program);
     }
-    program = flattenProgram({program});
     applyFinalSpecialisation(program);
-    return writeProgram(bc, program);
+    return writeProgram(bc, move(program));
 }
 
 static
@@ -4153,7 +3965,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc,
 
         if (hasUsefulStops(lbi)) {
             assert(lbi.stopAlphabet.size() == N_CHARS);
-            left.stopTable = add_to_engine_blob(bc, lbi.stopAlphabet.begin(),
+            left.stopTable = bc.engine_blob.add(lbi.stopAlphabet.begin(),
                                                 lbi.stopAlphabet.end());
         }
 
@@ -4194,174 +4006,125 @@
 }
 
 static
-void addPredBlocksSingle(
-    map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
-    vector<RoseInstruction> &program) {
-
-    vector<vector<RoseInstruction>> prog_blocks;
-
-    for (const auto &m : predProgramLists) {
-        const u32 &pred_state = m.first;
-        assert(!m.second.empty());
-        auto subprog = flattenProgram(m.second);
-
-        // Check our pred state.
-        auto ri = RoseInstruction(ROSE_INSTR_CHECK_STATE,
-                                  JumpTarget::NEXT_BLOCK);
-        ri.u.checkState.index = pred_state;
-        subprog.insert(begin(subprog), ri);
-        assert(subprog.back().code() == ROSE_INSTR_END);
-        subprog.pop_back();
-        prog_blocks.push_back(move(subprog));
-    }
-
-    auto prog = flattenProgram(prog_blocks);
-    program.insert(end(program), begin(prog), end(prog));
+void addPredBlockSingle(u32 pred_state, RoseProgram &pred_block,
+                        RoseProgram &program) {
+    // Prepend an instruction to check the pred state is on.
+    const auto *end_inst = pred_block.end_instruction();
+    pred_block.insert(begin(pred_block),
+                      make_unique<RoseInstrCheckState>(pred_state, end_inst));
+    program.add_block(move(pred_block));
 }
 
 static
-u32 programLength(const vector<RoseInstruction> &program) {
-    u32 len = 0;
-    for (const auto &ri : program) {
-        len += ROUNDUP_N(ri.length(), ROSE_INSTR_MIN_ALIGN);
-    }
-    return len;
-}
+void addPredBlocksAny(build_context &bc, map<u32, RoseProgram> &pred_blocks,
+                      RoseProgram &program) {
+    RoseProgram sparse_program;
 
-static
-void addPredBlocksMulti(build_context &bc,
-                        map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
-                        vector<RoseInstruction> &program) {
-    assert(!predProgramLists.empty());
-
-    // First, add the iterator itself.
     vector<u32> keys;
-    for (const auto &elem : predProgramLists) {
-        keys.push_back(elem.first);
-    }
-    DEBUG_PRINTF("%zu keys: %s\n", keys.size(), as_string_list(keys).c_str());
-
-    vector<mmbit_sparse_iter> iter;
-    mmbBuildSparseIterator(iter, keys, bc.numStates);
-    assert(!iter.empty());
-    u32 iter_offset = addIteratorToTable(bc, iter);
-
-    // Construct our program, starting with the SPARSE_ITER_BEGIN
-    // instruction, keeping track of the jump offset for each sub-program.
-    vector<RoseInstruction> sparse_program;
-    vector<u32> jump_table;
-
-    sparse_program.push_back(RoseInstruction(ROSE_INSTR_SPARSE_ITER_BEGIN,
-                                             JumpTarget::PROGRAM_END));
-    u32 curr_offset = programLength(program) + programLength(sparse_program);
-
-    for (const auto &e : predProgramLists) {
-        DEBUG_PRINTF("subprogram %zu has offset %u\n", jump_table.size(),
-                     curr_offset);
-        jump_table.push_back(curr_offset);
-        assert(!e.second.empty());
-        auto subprog = flattenProgram(e.second);
-
-        if (e.first != keys.back()) {
-            // For all but the last subprogram, replace the END instruction
-            // with a SPARSE_ITER_NEXT.
-            assert(!subprog.empty());
-            assert(subprog.back().code() == ROSE_INSTR_END);
-            subprog.back() = RoseInstruction(ROSE_INSTR_SPARSE_ITER_NEXT,
-                                             JumpTarget::PROGRAM_END);
-        }
-
-        curr_offset += programLength(subprog);
-        insert(&sparse_program, end(sparse_program), subprog);
+    for (const u32 &key : pred_blocks | map_keys) {
+        keys.push_back(key);
     }
 
-    // Strip the END instruction from the last block.
-    assert(sparse_program.back().code() == ROSE_INSTR_END);
-    sparse_program.pop_back();
+    const RoseInstruction *end_inst = sparse_program.end_instruction();
+    auto ri = make_unique<RoseInstrSparseIterAny>(bc.numStates, keys, end_inst);
+    sparse_program.add_before_end(move(ri));
 
-    sparse_program = flattenProgram({sparse_program});
-
-    // Write the jump table into the bytecode.
-    const u32 jump_table_offset =
-        add_to_engine_blob(bc, begin(jump_table), end(jump_table));
-
-    // Write jump table and iterator offset into sparse iter instructions.
-    auto keys_it = begin(keys);
-    for (auto &ri : sparse_program) {
-        switch (ri.code()) {
-        case ROSE_INSTR_SPARSE_ITER_BEGIN:
-            ri.u.sparseIterBegin.iter_offset = iter_offset;
-            ri.u.sparseIterBegin.jump_table = jump_table_offset;
-            break;
-        case ROSE_INSTR_SPARSE_ITER_NEXT:
-            ri.u.sparseIterNext.iter_offset = iter_offset;
-            ri.u.sparseIterNext.jump_table = jump_table_offset;
-            assert(keys_it != end(keys));
-            ri.u.sparseIterNext.state = *keys_it++;
-            break;
-        default:
-            break;
-        }
-    }
-
-    program.insert(end(program), begin(sparse_program), end(sparse_program));
+    RoseProgram &block = pred_blocks.begin()->second;
+    sparse_program.add_before_end(move(block));
+    program.add_block(move(sparse_program));
 }
 
 static
-void addPredBlocks(build_context &bc,
-                   map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
-                   vector<RoseInstruction> &program) {
-    const size_t num_preds = predProgramLists.size();
+void addPredBlocksMulti(build_context &bc, map<u32, RoseProgram> &pred_blocks,
+                        RoseProgram &program) {
+    assert(!pred_blocks.empty());
+
+    RoseProgram sparse_program;
+    const RoseInstruction *end_inst = sparse_program.end_instruction();
+    vector<pair<u32, const RoseInstruction *>> jump_table;
+
+    // BEGIN instruction.
+    auto ri_begin =
+        make_unique<RoseInstrSparseIterBegin>(bc.numStates, end_inst);
+    RoseInstrSparseIterBegin *begin_inst = ri_begin.get();
+    sparse_program.add_before_end(move(ri_begin));
+
+    // NEXT instructions, one per pred program.
+    u32 prev_key = pred_blocks.begin()->first;
+    for (auto it = next(begin(pred_blocks)); it != end(pred_blocks); ++it) {
+        auto ri = make_unique<RoseInstrSparseIterNext>(prev_key, begin_inst,
+                                                       end_inst);
+        sparse_program.add_before_end(move(ri));
+        prev_key = it->first;
+    }
+
+    // Splice in each pred program after its BEGIN/NEXT.
+    auto out_it = begin(sparse_program);
+    for (auto &m : pred_blocks) {
+        u32 key = m.first;
+        RoseProgram &flat_prog = m.second;
+        assert(!flat_prog.empty());
+        const size_t block_len = flat_prog.size() - 1; // without INSTR_END.
+
+        assert(dynamic_cast<const RoseInstrSparseIterBegin *>(out_it->get()) ||
+               dynamic_cast<const RoseInstrSparseIterNext *>(out_it->get()));
+        out_it = sparse_program.insert(++out_it, move(flat_prog));
+
+        // Jump table target for this key is the beginning of the block we just
+        // spliced in.
+        jump_table.emplace_back(key, out_it->get());
+
+        assert(distance(begin(sparse_program), out_it) + block_len <=
+               sparse_program.size());
+        advance(out_it, block_len);
+    }
+
+    // Write the jump table back into the SPARSE_ITER_BEGIN instruction.
+    begin_inst->jump_table = move(jump_table);
+
+    program.add_block(move(sparse_program));
+}
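// ---- Editor's aside (not part of the patch): simplified sketch of the
// layout addPredBlocksMulti() above encodes. The jump table pairs each pred
// state key with the first instruction of the block spliced in after its
// BEGIN/NEXT; the types below are stand-ins for the real
// (u32, const RoseInstruction *) pairs.
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

using ToyJumpTable = std::vector<std::pair<uint32_t, std::size_t>>;

// Look up the block entry point for a switched-on pred state; the table is
// sorted by key because it is built from std::map iteration order.
std::size_t blockFor(const ToyJumpTable &jt, uint32_t key) {
    for (const auto &e : jt) {
        if (e.first == key) {
            return e.second;
        }
    }
    return SIZE_MAX; // state not in the table: nothing to run
}
// ---- End editor's aside.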
+
+static
+void addPredBlocks(build_context &bc, map<u32, RoseProgram> &pred_blocks,
+                   RoseProgram &program) {
+    // Trim empty blocks, if any exist.
+    for (auto it = pred_blocks.begin(); it != pred_blocks.end();) {
+        if (it->second.empty()) {
+            it = pred_blocks.erase(it);
+        } else {
+            ++it;
+        }
+    }
+
+    const size_t num_preds = pred_blocks.size();
     if (num_preds == 0) {
-        program = flattenProgram({program});
         return;
     }
 
     if (num_preds == 1) {
-        addPredBlocksSingle(predProgramLists, program);
+        const auto head = pred_blocks.begin();
+        addPredBlockSingle(head->first, head->second, program);
         return;
     }
 
-    addPredBlocksMulti(bc, predProgramLists, program);
-}
-
-/**
- * Returns the pair (program offset, sparse iter offset).
- */
-static
-vector<RoseInstruction> makeSparseIterProgram(build_context &bc,
-                    map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
-                    const vector<RoseInstruction> &root_program,
-                    const vector<RoseInstruction> &pre_program) {
-    vector<RoseInstruction> program;
-    u32 curr_offset = 0;
-
-    // Add pre-program first.
-    for (const auto &ri : pre_program) {
-        program.push_back(ri);
-        curr_offset += ROUNDUP_N(ri.length(), ROSE_INSTR_MIN_ALIGN);
+    // First, see if all our blocks are equivalent, in which case we can
+    // collapse them down into one.
+    const auto &blocks = pred_blocks | map_values;
+    if (all_of(begin(blocks), end(blocks), [&](const RoseProgram &block) {
+            return RoseProgramEquivalence()(*begin(blocks), block);
+        })) {
+        DEBUG_PRINTF("all blocks equiv\n");
+        addPredBlocksAny(bc, pred_blocks, program);
+        return;
     }
 
-    // Add blocks to deal with non-root edges (triggered by sparse iterator or
-    // mmbit_isset checks). This operation will flatten the program up to this
-    // point.
-    addPredBlocks(bc, predProgramLists, program);
-
-    // If we have a root program, replace the END instruction with it. Note
-    // that the root program has already been flattened.
-    assert(!program.empty());
-    assert(program.back().code() == ROSE_INSTR_END);
-    if (!root_program.empty()) {
-        program.pop_back();
-        program.insert(end(program), begin(root_program), end(root_program));
-    }
-
-    return program;
+    addPredBlocksMulti(bc, pred_blocks, program);
 }
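// ---- Editor's aside (not part of the patch): standalone illustration of the
// collapse addPredBlocks() above performs. When every predecessor's block
// compares equivalent, a single copy guarded by an "any of these states on?"
// check suffices. std::string stands in for RoseProgram and operator== for
// RoseProgramEquivalence; pred_blocks is assumed non-empty.
#include <algorithm>
#include <map>
#include <string>

bool allBlocksEquivalent(const std::map<unsigned, std::string> &pred_blocks) {
    const std::string &first = pred_blocks.begin()->second;
    return std::all_of(pred_blocks.begin(), pred_blocks.end(),
                       [&first](const std::pair<const unsigned, std::string> &m) {
                           return m.second == first;
                       });
}
// ---- End editor's aside.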
 
 static
 void makePushDelayedInstructions(const RoseBuildImpl &build, u32 final_id,
-                                 vector<RoseInstruction> &program) {
+                                 RoseProgram &program) {
     const auto &lit_infos = getLiteralInfoByFinalId(build, final_id);
     const auto &arb_lit_info = **lit_infos.begin();
     if (arb_lit_info.delayed_ids.empty()) {
@@ -4376,10 +4139,9 @@ void makePushDelayedInstructions(const RoseBuildImpl &build, u32 final_id,
         DEBUG_PRINTF("final_id=%u delay=%u child_id=%u\n", final_id,
                      child_literal.delay, child_id);
 
-        auto ri = RoseInstruction(ROSE_INSTR_PUSH_DELAYED);
-        ri.u.pushDelayed.delay = verify_u8(child_literal.delay);
-        ri.u.pushDelayed.index = delay_index;
-        program.push_back(move(ri));
+        auto ri = make_unique<RoseInstrPushDelayed>(
+            verify_u8(child_literal.delay), delay_index);
+        program.add_before_end(move(ri));
     }
 }
 
@@ -4397,20 +4159,17 @@ rose_group getFinalIdGroupsUnion(const RoseBuildImpl &build, u32 final_id) {
 
 static
 void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id,
-                               vector<RoseInstruction> &program) {
+                               RoseProgram &program) {
     rose_group groups = getFinalIdGroupsUnion(build, final_id);
     if (!groups) {
         return;
     }
-
-    auto ri = RoseInstruction(ROSE_INSTR_CHECK_GROUPS);
-    ri.u.checkGroups.groups = groups;
-    program.push_back(move(ri));
+    program.add_before_end(make_unique<RoseInstrCheckGroups>(groups));
 }
 
 static
-void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 final_id,
-                                 vector<RoseInstruction> &program) {
+void makeCheckLitMaskInstruction(const RoseBuildImpl &build, build_context &bc,
+                                 u32 final_id, RoseProgram &program) {
     assert(contains(build.final_id_to_literal, final_id));
     const auto &lit_infos = getLiteralInfoByFinalId(build, final_id);
     assert(!lit_infos.empty());
 
@@ -4419,7 +4178,7 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 final_id,
         return;
     }
 
-    auto ri = RoseInstruction(ROSE_INSTR_CHECK_LIT_MASK);
+    vector<LookEntry> look;
 
     assert(build.final_id_to_literal.at(final_id).size() == 1);
     u32 lit_id = *build.final_id_to_literal.at(final_id).begin();
@@ -4427,19 +4186,21 @@
     DEBUG_PRINTF("building mask for lit %u (final id %u) %s\n", lit_id,
                  final_id, dumpString(s).c_str());
     assert(s.length() <= MAX_MASK2_WIDTH);
-    u32 i = 0;
+    s32 i = 0 - s.length();
     for (const auto &e : s) {
-        ri.u.checkLitMask.and_mask.a8[i] = e.nocase ? 0 : CASE_BIT;
-        ri.u.checkLitMask.cmp_mask.a8[i] = e.nocase ? 0 : (CASE_BIT & e.c);
+        if (!e.nocase) {
+            look.emplace_back(verify_s8(i), e);
+        }
         i++;
     }
 
-    program.push_back(move(ri));
+    assert(!look.empty());
+    makeLookaroundInstruction(bc, look, program);
 }
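// ---- Editor's aside (not part of the patch): worked example of the offset
// arithmetic in makeCheckLitMaskInstruction() above. A literal of length N
// that has just matched occupies offsets -N .. -1 relative to the match end,
// hence `s32 i = 0 - s.length()`; only case-sensitive positions get an entry.
// The uppercase test below is a stand-in for the real per-char nocase flag.
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<int, char>> caseSensitiveLook(const std::string &s) {
    std::vector<std::pair<int, char>> look;
    int i = -static_cast<int>(s.length());
    for (char c : s) {
        if (c >= 'A' && c <= 'Z') {
            look.emplace_back(i, c);
        }
        i++;
    }
    return look; // e.g. "aBc" -> {(-2, 'B')}
}
// ---- End editor's aside.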
 
 static
 void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id,
-                                vector<RoseInstruction> &program) {
+                                RoseProgram &program) {
     assert(contains(build.final_id_to_literal, final_id));
     const auto &lit_infos = getLiteralInfoByFinalId(build, final_id);
 
@@ -4453,10 +4214,8 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id,
     }
 
     DEBUG_PRINTF("final_id %u squashes 0x%llx\n", final_id, groups);
-
-    auto ri = RoseInstruction(ROSE_INSTR_SQUASH_GROUPS);
-    ri.u.squashGroups.groups = ~groups; // Negated, so we can just AND it in.
-    program.push_back(move(ri));
+    program.add_before_end(
+        make_unique<RoseInstrSquashGroups>(~groups)); // Note negated.
 }
 
 static
@@ -4475,7 +4234,7 @@ u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) {
 static
 void makeRecordAnchoredInstruction(const RoseBuildImpl &build,
                                    build_context &bc, u32 final_id,
-                                   vector<RoseInstruction> &program) {
+                                   RoseProgram &program) {
     assert(contains(build.final_id_to_literal, final_id));
     const auto &lit_ids = build.final_id_to_literal.at(final_id);
 
@@ -4497,9 +4256,7 @@ void makeRecordAnchoredInstruction(const RoseBuildImpl &build,
         return;
     }
 
-    auto ri = RoseInstruction(ROSE_INSTR_RECORD_ANCHORED);
-    ri.u.recordAnchored.id = final_id;
-    program.push_back(move(ri));
+    program.add_before_end(make_unique<RoseInstrRecordAnchored>(final_id));
 }
 
 static
@@ -4519,7 +4276,7 @@ static
 void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc,
                                   u32 final_id,
                                   const vector<RoseEdge> &lit_edges,
-                                  vector<RoseInstruction> &program) {
+                                  RoseProgram &program) {
     if (lit_edges.empty()) {
         return;
     }
 
@@ -4565,9 +4322,50 @@ void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc,
     assert(min_offset < UINT32_MAX);
     DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset);
 
-    auto ri = RoseInstruction(ROSE_INSTR_CHECK_LIT_EARLY);
-    ri.u.checkLitEarly.min_offset = min_offset;
-    program.push_back(move(ri));
+    program.add_before_end(make_unique<RoseInstrCheckLitEarly>(min_offset));
+}
+
+static
+void makeCheckLiteralInstruction(const RoseBuildImpl &build,
+                                 const build_context &bc, u32 final_id,
+                                 RoseProgram &program) {
+    const auto &lits = build.final_id_to_literal.at(final_id);
+    if (lits.size() != 1) {
+        // Long literals should not share a final_id.
+        assert(all_of(begin(lits), end(lits), [&](u32 lit_id) {
+            const rose_literal_id &lit = build.literals.right.at(lit_id);
+            return lit.table != ROSE_FLOATING ||
+                   lit.s.length() <= bc.longLitLengthThreshold;
+        }));
+        return;
+    }
+
+    u32 lit_id = *lits.begin();
+    if (build.isDelayed(lit_id)) {
+        return;
+    }
+
+    const rose_literal_id &lit = build.literals.right.at(lit_id);
+    if (lit.table != ROSE_FLOATING) {
+        return;
+    }
+    assert(bc.longLitLengthThreshold > 0);
+    if (lit.s.length() <= bc.longLitLengthThreshold) {
+        return;
+    }
+
+    // Check resource limits as well.
+    if (lit.s.length() > build.cc.grey.limitLiteralLength) {
+        throw ResourceLimitError();
+    }
+
+    unique_ptr<RoseInstruction> ri;
+    if (lit.s.any_nocase()) {
+        ri = make_unique<RoseInstrCheckLongLitNocase>(lit.s.get_string());
+    } else {
+        ri = make_unique<RoseInstrCheckLongLit>(lit.s.get_string());
    }
+    program.add_before_end(move(ri));
 }
 
 static
@@ -4585,47 +4383,52 @@ bool hasDelayedLiteral(RoseBuildImpl &build,
 }
 
 static
-vector<RoseInstruction> buildLitInitialProgram(RoseBuildImpl &build,
-                                               build_context &bc, u32 final_id,
-                                               const vector<RoseEdge> &lit_edges) {
-    vector<RoseInstruction> pre_program;
+RoseProgram buildLitInitialProgram(RoseBuildImpl &build, build_context &bc,
+                                   u32 final_id,
+                                   const vector<RoseEdge> &lit_edges) {
+    RoseProgram program;
 
     // No initial program for EOD.
     if (final_id == MO_INVALID_IDX) {
-        return pre_program;
+        return program;
     }
 
     DEBUG_PRINTF("final_id %u\n", final_id);
 
+    // Check long literal info.
+    makeCheckLiteralInstruction(build, bc, final_id, program);
+
     // Check lit mask.
-    makeCheckLitMaskInstruction(build, final_id, pre_program);
+    makeCheckLitMaskInstruction(build, bc, final_id, program);
 
     // Check literal groups. This is an optimisation that we only perform for
     // delayed literals, as their groups may be switched off; ordinarily, we
     // can trust the HWLM matcher.
     if (hasDelayedLiteral(build, lit_edges)) {
-        makeGroupCheckInstruction(build, final_id, pre_program);
+        makeGroupCheckInstruction(build, final_id, program);
     }
 
     // Add instructions for pushing delayed matches, if there are any.
-    makePushDelayedInstructions(build, final_id, pre_program);
+    makePushDelayedInstructions(build, final_id, program);
 
     // Add pre-check for early literals in the floating table.
-    makeCheckLitEarlyInstruction(build, bc, final_id, lit_edges, pre_program);
+    makeCheckLitEarlyInstruction(build, bc, final_id, lit_edges, program);
 
-    return pre_program;
+    return program;
 }
 
 static
-vector<RoseInstruction> buildLiteralProgram(RoseBuildImpl &build,
-                                            build_context &bc, u32 final_id,
-                                            const vector<RoseEdge> &lit_edges) {
+RoseProgram buildLiteralProgram(RoseBuildImpl &build, build_context &bc,
+                                u32 final_id,
+                                const vector<RoseEdge> &lit_edges) {
     const auto &g = build.g;
 
     DEBUG_PRINTF("final id %u, %zu lit edges\n", final_id, lit_edges.size());
 
-    // pred state id -> list of programs
-    map<u32, vector<vector<RoseInstruction>>> predProgramLists;
+    RoseProgram program;
+
+    // Predecessor state id -> program block.
+    map<u32, RoseProgram> pred_blocks;
 
     // Construct sparse iter sub-programs.
     for (const auto &e : lit_edges) {
@@ -4633,68 +4436,56 @@ vector<RoseInstruction> buildLiteralProgram(RoseBuildImpl &build,
         if (build.isAnyStart(u)) {
             continue; // Root roles are not handled with sparse iterator.
         }
-        DEBUG_PRINTF("sparse iter edge (%zu,%zu)\n", g[u].idx,
-                     g[target(e, g)].idx);
+        DEBUG_PRINTF("sparse iter edge (%zu,%zu)\n", g[u].index,
+                     g[target(e, g)].index);
         assert(contains(bc.roleStateIndices, u));
         u32 pred_state = bc.roleStateIndices.at(u);
-        auto program = makeProgram(build, bc, e);
-        if (program.empty()) {
-            continue;
-        }
-        predProgramLists[pred_state].push_back(program);
+        pred_blocks[pred_state].add_block(makeProgram(build, bc, e));
     }
 
-    // Construct sub-program for handling root roles.
-    vector<vector<RoseInstruction>> root_programs;
+    // Add blocks to deal with non-root edges (triggered by sparse iterator or
+    // mmbit_isset checks).
+    addPredBlocks(bc, pred_blocks, program);
+
+    // Add blocks to handle root roles.
     for (const auto &e : lit_edges) {
         const auto &u = source(e, g);
         if (!build.isAnyStart(u)) {
             continue;
         }
-        DEBUG_PRINTF("root edge (%zu,%zu)\n", g[u].idx, g[target(e, g)].idx);
-        auto role_prog = makeProgram(build, bc, e);
-        if (role_prog.empty()) {
-            continue;
-        }
-        root_programs.push_back(role_prog);
+        DEBUG_PRINTF("root edge (%zu,%zu)\n", g[u].index,
+                     g[target(e, g)].index);
+        program.add_block(makeProgram(build, bc, e));
     }
 
     if (final_id != MO_INVALID_IDX) {
-        vector<RoseInstruction> prog;
+        RoseProgram root_block;
 
         // Literal may squash groups.
-        makeGroupSquashInstruction(build, final_id, prog);
+        makeGroupSquashInstruction(build, final_id, root_block);
 
         // Literal may be anchored and need to be recorded.
-        makeRecordAnchoredInstruction(build, bc, final_id, prog);
+        makeRecordAnchoredInstruction(build, bc, final_id, root_block);
 
-        if (!prog.empty()) {
-            root_programs.push_back(move(prog));
-        }
+        program.add_block(move(root_block));
     }
 
-    vector<RoseInstruction> root_program;
-    if (!root_programs.empty()) {
-        root_program = flattenProgram(root_programs);
-    }
-
-    auto pre_program = buildLitInitialProgram(build, bc, final_id, lit_edges);
-
-    // Put it all together.
-    return makeSparseIterProgram(bc, predProgramLists, root_program,
-                                 pre_program);
+    // Construct initial program up front, as its early checks must be able to
+    // jump to end and terminate processing for this literal.
+    auto lit_program = buildLitInitialProgram(build, bc, final_id, lit_edges);
+    lit_program.add_before_end(move(program));
+
+    return lit_program;
 }
 
 static
 u32 writeLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id,
                         const vector<RoseEdge> &lit_edges) {
-    auto program = buildLiteralProgram(build, bc, final_id, lit_edges);
+    RoseProgram program = buildLiteralProgram(build, bc, final_id, lit_edges);
     if (program.empty()) {
         return 0;
     }
-    // Note: already flattened.
     applyFinalSpecialisation(program);
-    return writeProgram(bc, program);
+    return writeProgram(bc, move(program));
 }
 
 static
@@ -4706,13 +4497,12 @@ u32 buildDelayRebuildProgram(RoseBuildImpl &build, build_context &bc,
         return 0; // No delayed IDs, no work to do.
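// ---- Editor's aside (not part of the patch): condensed restatement, as a
// hypothetical helper, of when makeCheckLiteralInstruction() above emits a
// confirm instruction: only a single, undelayed, floating-table literal
// longer than the long-literal threshold needs one, since beyond that length
// the HWLM matcher alone cannot confirm the match.
#include <cstddef>
#include <string>

bool needsLongLitConfirm(const std::string &lit, bool is_floating,
                         bool is_delayed, std::size_t longLitLengthThreshold) {
    return is_floating && !is_delayed && lit.length() > longLitLengthThreshold;
}
// ---- End editor's aside.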
} - vector program; - makeCheckLitMaskInstruction(build, final_id, program); + RoseProgram program; + makeCheckLitMaskInstruction(build, bc, final_id, program); makePushDelayedInstructions(build, final_id, program); assert(!program.empty()); - program = flattenProgram({program}); applyFinalSpecialisation(program); - return writeProgram(bc, program); + return writeProgram(bc, move(program)); } static @@ -4740,8 +4530,8 @@ map> findEdgesByLiteral(const RoseBuildImpl &build) { auto edge_list = vector(begin(m.second), end(m.second)); sort(begin(edge_list), end(edge_list), [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].idx, g[target(a, g)].idx) < - tie(g[source(b, g)].idx, g[target(b, g)].idx); + return tie(g[source(a, g)].index, g[target(a, g)].index) < + tie(g[source(b, g)].index, g[target(b, g)].index); }); lit_edge_map.emplace(m.first, edge_list); } @@ -4773,9 +4563,9 @@ pair buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) { } u32 litProgramsOffset = - add_to_engine_blob(bc, begin(bc.litPrograms), end(bc.litPrograms)); - u32 delayRebuildProgramsOffset = add_to_engine_blob( - bc, begin(delayRebuildPrograms), end(delayRebuildPrograms)); + bc.engine_blob.add(begin(bc.litPrograms), end(bc.litPrograms)); + u32 delayRebuildProgramsOffset = bc.engine_blob.add( + begin(delayRebuildPrograms), end(delayRebuildPrograms)); return {litProgramsOffset, delayRebuildProgramsOffset}; } @@ -4813,35 +4603,31 @@ pair buildReportPrograms(RoseBuildImpl &build, build_context &bc) { vector programs; programs.reserve(reports.size()); - vector program; for (ReportID id : reports) { - program.clear(); + RoseProgram program; const bool has_som = false; makeCatchupMpv(build, bc, id, program); makeReport(build, id, has_som, program); - program = flattenProgram({program}); applyFinalSpecialisation(program); - u32 offset = writeProgram(bc, program); + u32 offset = writeProgram(bc, move(program)); programs.push_back(offset); build.rm.setProgramOffset(id, offset); DEBUG_PRINTF("program for report %u @ %u (%zu instructions)\n", id, programs.back(), program.size()); } - u32 offset = add_to_engine_blob(bc, begin(programs), end(programs)); + u32 offset = bc.engine_blob.add(begin(programs), end(programs)); u32 count = verify_u32(programs.size()); return {offset, count}; } static -vector makeEodAnchorProgram(RoseBuildImpl &build, - build_context &bc, - const RoseEdge &e, - const bool multiple_preds) { +RoseProgram makeEodAnchorProgram(RoseBuildImpl &build, build_context &bc, + const RoseEdge &e, const bool multiple_preds) { const RoseGraph &g = build.g; const RoseVertex v = target(e, g); - vector program; + RoseProgram program; if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { makeRoleCheckBounds(build, v, e, program); @@ -4856,9 +4642,11 @@ vector makeEodAnchorProgram(RoseBuildImpl &build, makeCatchup(build, bc, reports, program); const bool has_som = false; + RoseProgram report_block; for (const auto &id : reports) { - makeReport(build, id, has_som, program); + makeReport(build, id, has_som, report_block); } + program.add_before_end(move(report_block)); return program; } @@ -4869,7 +4657,7 @@ bool hasEodAnchoredSuffix(const RoseBuildImpl &build) { for (auto v : vertices_range(g)) { if (g[v].suffix && build.isInETable(v)) { DEBUG_PRINTF("vertex %zu is in eod table and has a suffix\n", - g[v].idx); + g[v].index); return true; } } @@ -4881,7 +4669,7 @@ bool hasEodMatcher(const RoseBuildImpl &build) { const RoseGraph &g = build.g; for (auto v : vertices_range(g)) { if (build.isInETable(v)) { - 
DEBUG_PRINTF("vertex %zu is in eod table\n", g[v].idx); + DEBUG_PRINTF("vertex %zu is in eod table\n", g[v].index); return true; } } @@ -4890,30 +4678,30 @@ bool hasEodMatcher(const RoseBuildImpl &build) { static void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, - bool in_etable, vector &program) { + bool in_etable, RoseProgram &program) { const RoseGraph &g = build.g; - // pred state id -> list of programs - map>> predProgramLists; + // Predecessor state id -> program block. + map pred_blocks; for (auto v : vertices_range(g)) { if (!g[v].eod_accept) { continue; } - DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx, + DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].index, in_degree(v, g)); vector edge_list; for (const auto &e : in_edges_range(v, g)) { RoseVertex u = source(e, g); if (build.isInETable(u) != in_etable) { - DEBUG_PRINTF("pred %zu %s in etable\n", g[u].idx, + DEBUG_PRINTF("pred %zu %s in etable\n", g[u].index, in_etable ? "is not" : "is"); continue; } if (canEagerlyReportAtEod(build, e)) { - DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx); + DEBUG_PRINTF("already done report for vertex %zu\n", g[u].index); continue; } edge_list.push_back(e); @@ -4923,29 +4711,18 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, for (const auto &e : edge_list) { RoseVertex u = source(e, g); assert(contains(bc.roleStateIndices, u)); - u32 predStateIdx = bc.roleStateIndices.at(u); - - auto prog = makeEodAnchorProgram(build, bc, e, multiple_preds); - if (prog.empty()) { - continue; - } - predProgramLists[predStateIdx].push_back(prog); + u32 pred_state = bc.roleStateIndices.at(u); + pred_blocks[pred_state].add_block( + makeEodAnchorProgram(build, bc, e, multiple_preds)); } } - if (predProgramLists.empty()) { - return; - } - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - addPredBlocks(bc, predProgramLists, program); + addPredBlocks(bc, pred_blocks, program); } static void addEodEventProgram(RoseBuildImpl &build, build_context &bc, - vector &program) { + RoseProgram &program) { if (build.eod_event_literal_id == MO_INVALID_IDX) { return; } @@ -4967,65 +4744,51 @@ void addEodEventProgram(RoseBuildImpl &build, build_context &bc, // Sort edge list for determinism, prettiness. 
sort(begin(edge_list), end(edge_list), [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].idx, g[target(a, g)].idx) < - tie(g[source(b, g)].idx, g[target(b, g)].idx); + return tie(g[source(a, g)].index, g[target(a, g)].index) < + tie(g[source(b, g)].index, g[target(b, g)].index); }); - auto prog = buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); - program.insert(end(program), begin(prog), end(prog)); + program.add_block( + buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list)); } static -void addEnginesEodProgram(u32 eodNfaIterOffset, - vector &program) { +void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program) { if (!eodNfaIterOffset) { return; } - auto ri = RoseInstruction(ROSE_INSTR_ENGINES_EOD); - ri.u.enginesEod.iter_offset = eodNfaIterOffset; - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.push_back(move(ri)); - program.emplace_back(ROSE_INSTR_END); + RoseProgram block; + block.add_before_end(make_unique(eodNfaIterOffset)); + program.add_block(move(block)); } static -void addSuffixesEodProgram(const RoseBuildImpl &build, - vector &program) { +void addSuffixesEodProgram(const RoseBuildImpl &build, RoseProgram &program) { if (!hasEodAnchoredSuffix(build)) { return; } - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.emplace_back(ROSE_INSTR_SUFFIXES_EOD); - program.emplace_back(ROSE_INSTR_END); + RoseProgram block; + block.add_before_end(make_unique()); + program.add_block(move(block)); } static -void addMatcherEodProgram(const RoseBuildImpl &build, - vector &program) { +void addMatcherEodProgram(const RoseBuildImpl &build, RoseProgram &program) { if (!hasEodMatcher(build)) { return; } - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.emplace_back(ROSE_INSTR_MATCHER_EOD); - program.emplace_back(ROSE_INSTR_END); + RoseProgram block; + block.add_before_end(make_unique()); + program.add_block(move(block)); } static u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, u32 eodNfaIterOffset) { - vector program; + RoseProgram program; addEodEventProgram(build, bc, program); addEnginesEodProgram(eodNfaIterOffset, program); @@ -5034,17 +4797,12 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, addEodAnchorProgram(build, bc, true, program); addSuffixesEodProgram(build, program); - if (program.size() == 1) { - assert(program.back().code() == ROSE_INSTR_END); - return 0; - } - if (program.empty()) { return 0; } applyFinalSpecialisation(program); - return writeProgram(bc, program); + return writeProgram(bc, move(program)); } static @@ -5164,7 +4922,175 @@ u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, vector iter; mmbBuildSparseIterator(iter, vec, queue_count - leftfixBeginQueue); - return addIteratorToTable(bc, iter); + return bc.engine_blob.add_iterator(iter); +} + +static +void allocateFinalIdToSet(RoseBuildImpl &build, const set &lits, + size_t longLitLengthThreshold, u32 *next_final_id) { + const auto &g = build.g; + auto &literal_info = build.literal_info; + auto &final_id_to_literal = build.final_id_to_literal; + + /* We can allocate the same final id to multiple literals of the same type + * if they share the same vertex set and trigger the same delayed literal + * ids and squash the same roles and have the same group squashing + * behaviour. Benefits literals cannot be merged. 
+ */
+
+    assert(longLitLengthThreshold > 0);
+
+    for (u32 int_id : lits) {
+        rose_literal_info &curr_info = literal_info[int_id];
+        const rose_literal_id &lit = build.literals.right.at(int_id);
+        const auto &verts = curr_info.vertices;
+
+        // Literals with benefits cannot be merged.
+        if (curr_info.requires_benefits) {
+            DEBUG_PRINTF("id %u has benefits\n", int_id);
+            goto assign_new_id;
+        }
+
+        // Long literals (that require CHECK_LITERAL instructions) cannot be
+        // merged.
+        if (lit.s.length() > longLitLengthThreshold) {
+            DEBUG_PRINTF("id %u is a long literal\n", int_id);
+            goto assign_new_id;
+        }
+
+        if (!verts.empty() && curr_info.delayed_ids.empty()) {
+            vector<u32> cand;
+            insert(&cand, cand.end(), g[*verts.begin()].literals);
+            for (auto v : verts) {
+                vector<u32> temp;
+                set_intersection(cand.begin(), cand.end(),
+                                 g[v].literals.begin(),
+                                 g[v].literals.end(),
+                                 inserter(temp, temp.end()));
+                cand.swap(temp);
+            }
+
+            for (u32 cand_id : cand) {
+                if (cand_id >= int_id) {
+                    break;
+                }
+
+                const auto &cand_info = literal_info[cand_id];
+                const auto &cand_lit = build.literals.right.at(cand_id);
+
+                if (cand_lit.s.length() > longLitLengthThreshold) {
+                    continue;
+                }
+
+                if (cand_info.requires_benefits) {
+                    continue;
+                }
+
+                if (!cand_info.delayed_ids.empty()) {
+                    /* TODO: allow cases where delayed ids are equivalent.
+                     * This is awkward currently as they have not had their
+                     * final ids allocated yet */
+                    continue;
+                }
+
+                if (lits.find(cand_id) == lits.end()
+                    || cand_info.vertices.size() != verts.size()
+                    || cand_info.squash_group != curr_info.squash_group) {
+                    continue;
+                }
+
+                /* if we are squashing groups we need to check if they are the
+                 * same group */
+                if (cand_info.squash_group
+                    && cand_info.group_mask != curr_info.group_mask) {
+                    continue;
+                }
+
+                u32 final_id = cand_info.final_id;
+                assert(final_id != MO_INVALID_IDX);
+                assert(curr_info.final_id == MO_INVALID_IDX);
+                curr_info.final_id = final_id;
+                final_id_to_literal[final_id].insert(int_id);
+                goto next_lit;
+            }
+        }
+
+    assign_new_id:
+        /* oh well, have to give it a fresh one, hang the expense */
+        DEBUG_PRINTF("allocating final id %u to %u\n", *next_final_id, int_id);
+        assert(curr_info.final_id == MO_INVALID_IDX);
+        curr_info.final_id = *next_final_id;
+        final_id_to_literal[*next_final_id].insert(int_id);
+        (*next_final_id)++;
+    next_lit:;
+    }
+}
+
+static
+bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) {
+    assert(lit_id < build.literal_info.size());
+    const auto &info = build.literal_info[lit_id];
+    if (!info.vertices.empty()) {
+        return true;
+    }
+
+    for (const u32 &delayed_id : info.delayed_ids) {
+        assert(delayed_id < build.literal_info.size());
+        const rose_literal_info &delayed_info = build.literal_info[delayed_id];
+        if (!delayed_info.vertices.empty()) {
+            return true;
+        }
+    }
+
+    DEBUG_PRINTF("literal %u has no refs\n", lit_id);
+    return false;
+}
+
+/** \brief Allocate final literal IDs for all literals. */
+static
+void allocateFinalLiteralId(RoseBuildImpl &build,
+                            size_t longLitLengthThreshold) {
+    set<u32> anch;
+    set<u32> norm;
+    set<u32> delay;
+
+    /* undelayed ids come first */
+    assert(build.final_id_to_literal.empty());
+    u32 next_final_id = 0;
+    for (u32 i = 0; i < build.literal_info.size(); i++) {
+        assert(!build.hasFinalId(i));
+
+        if (!isUsedLiteral(build, i)) {
+            /* what is this literal good for? absolutely nothing */
+            continue;
+        }
+
+        // The special EOD event literal has its own program and does not need
+        // a real literal ID.
+ if (i == build.eod_event_literal_id) { + assert(build.eod_event_literal_id != MO_INVALID_IDX); + continue; + } + + if (build.isDelayed(i)) { + assert(!build.literal_info[i].requires_benefits); + delay.insert(i); + } else if (build.literals.right.at(i).table == ROSE_ANCHORED) { + anch.insert(i); + } else { + norm.insert(i); + } + } + + /* normal lits */ + allocateFinalIdToSet(build, norm, longLitLengthThreshold, &next_final_id); + + /* next anchored stuff */ + build.anchored_base_id = next_final_id; + allocateFinalIdToSet(build, anch, longLitLengthThreshold, &next_final_id); + + /* delayed ids come last */ + build.delay_base_id = next_final_id; + allocateFinalIdToSet(build, delay, longLitLengthThreshold, &next_final_id); } static @@ -5202,17 +5128,90 @@ aligned_unique_ptr addSmallWriteEngine(RoseBuildImpl &build, return rose2; } +/** + * \brief Returns the pair (number of literals, max length) for all real + * literals in the floating table that are in-use. + */ +static +pair floatingCountAndMaxLen(const RoseBuildImpl &build) { + size_t num = 0; + size_t max_len = 0; + + for (const auto &e : build.literals.right) { + const u32 id = e.first; + const rose_literal_id &lit = e.second; + + if (lit.table != ROSE_FLOATING) { + continue; + } + if (lit.delay) { + // Skip delayed literals, so that we only count the undelayed + // version that ends up in the HWLM table. + continue; + } + if (!isUsedLiteral(build, id)) { + continue; + } + + num++; + max_len = max(max_len, lit.s.length()); + } + DEBUG_PRINTF("%zu floating literals with max_len=%zu\n", num, max_len); + return {num, max_len}; +} + +size_t calcLongLitThreshold(const RoseBuildImpl &build, + const size_t historyRequired) { + const auto &cc = build.cc; + + // In block mode, we should only use the long literal support for literals + // that cannot be handled by HWLM. + if (!cc.streaming) { + return HWLM_LITERAL_MAX_LEN; + } + + size_t longLitLengthThreshold = ROSE_LONG_LITERAL_THRESHOLD_MIN; + + // Expand to size of history we've already allocated. Note that we need N-1 + // bytes of history to match a literal of length N. + longLitLengthThreshold = max(longLitLengthThreshold, historyRequired + 1); + + // If we only have one literal, allow for a larger value in order to avoid + // building a long literal table for a trivial Noodle case that we could + // fit in history. + const auto num_len = floatingCountAndMaxLen(build); + if (num_len.first == 1) { + if (num_len.second > longLitLengthThreshold) { + DEBUG_PRINTF("expanding for single literal of length %zu\n", + num_len.second); + longLitLengthThreshold = num_len.second; + } + } + + // Clamp to max history available. + longLitLengthThreshold = + min(longLitLengthThreshold, size_t{cc.grey.maxHistoryAvailable} + 1); + + return longLitLengthThreshold; +} + aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { DerivedBoundaryReports dboundary(boundary); size_t historyRequired = calcHistoryRequired(); // Updated by HWLM. 
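// ---- Editor's aside (not part of the patch): restatement of the
// calcLongLitThreshold() policy above with a worked example; parameter names
// and the numbers below are illustrative. Matching a length-N literal needs
// N-1 bytes of history, hence the +1 terms.
#include <algorithm>
#include <cstddef>

std::size_t toyLongLitThreshold(bool streaming, std::size_t minThreshold,
                                std::size_t historyRequired,
                                std::size_t numFloatingLits,
                                std::size_t maxFloatingLen,
                                std::size_t maxHistoryAvailable,
                                std::size_t hwlmMaxLen) {
    if (!streaming) {
        return hwlmMaxLen; // block mode: only literals HWLM cannot handle
    }
    std::size_t t = std::max(minThreshold, historyRequired + 1);
    if (numFloatingLits == 1) {
        t = std::max(t, maxFloatingLen); // lone Noodle literal fits history
    }
    return std::min(t, maxHistoryAvailable + 1);
}
// Example: streaming, minThreshold=33, historyRequired=110 -> t=111; a single
// floating literal of length 200 raises t to 200; maxHistoryAvailable=109
// would then clamp t down to 110.
// ---- End editor's aside.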
+ size_t longLitLengthThreshold = calcLongLitThreshold(*this, + historyRequired); + DEBUG_PRINTF("longLitLengthThreshold=%zu\n", longLitLengthThreshold); + + allocateFinalLiteralId(*this, longLitLengthThreshold); auto anchored_dfas = buildAnchoredDfas(*this); build_context bc; bc.floatingMinLiteralMatchOffset = findMinFloatingLiteralMatch(*this, anchored_dfas); - bc.needs_catchup = needsCatchup(*this); + bc.longLitLengthThreshold = longLitLengthThreshold; + bc.needs_catchup = needsCatchup(*this, anchored_dfas); recordResources(bc.resources, *this); if (!anchored_dfas.empty()) { bc.resources.has_anchored = true; @@ -5247,7 +5246,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { return nullptr; } u32 eodNfaIterOffset = buildEodNfaIterator(bc, leftfixBeginQueue); - buildCountingMiracles(*this, bc); + buildCountingMiracles(bc); u32 queue_count = qif.allocated_count(); /* excludes anchored matcher q; * som rev nfas */ @@ -5273,6 +5272,11 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); + size_t longLitStreamStateRequired = 0; + u32 longLitTableOffset = buildLongLiteralTable(*this, bc.engine_blob, + bc.longLiterals, longLitLengthThreshold, &historyRequired, + &longLitStreamStateRequired); + vector activeLeftIter; buildActiveLeftIter(leftInfoTable, activeLeftIter); @@ -5287,13 +5291,12 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 currOffset; /* relative to base of RoseEngine */ if (!bc.engine_blob.empty()) { - currOffset = bc.engine_blob_base + byte_length(bc.engine_blob); + currOffset = bc.engine_blob.base_offset + bc.engine_blob.size(); } else { currOffset = sizeof(RoseEngine); } - UNUSED const size_t engineBlobSize = - byte_length(bc.engine_blob); // test later + UNUSED const size_t engineBlobSize = bc.engine_blob.size(); // test later currOffset = ROUNDUP_CL(currOffset); DEBUG_PRINTF("currOffset %u\n", currOffset); @@ -5312,9 +5315,8 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // Build floating HWLM matcher. rose_group fgroups = 0; size_t fsize = 0; - size_t floatingStreamStateRequired = 0; - auto ftable = buildFloatingMatcher(*this, &fgroups, &fsize, &historyRequired, - &floatingStreamStateRequired); + auto ftable = buildFloatingMatcher(*this, bc.longLitLengthThreshold, + &fgroups, &fsize, &historyRequired); u32 fmatcherOffset = 0; if (ftable) { currOffset = ROUNDUP_CL(currOffset); @@ -5387,7 +5389,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { memset(&stateOffsets, 0, sizeof(stateOffsets)); fillStateOffsets(*this, bc.numStates, anchorStateSize, activeArrayCount, activeLeftCount, laggedRoseCount, - floatingStreamStateRequired, historyRequired, + longLitStreamStateRequired, historyRequired, &stateOffsets); scatter_plan_raw state_scatter; @@ -5434,11 +5436,13 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->ekeyCount = rm.numEkeys(); engine->dkeyCount = rm.numDkeys(); + engine->dkeyLogSize = fatbit_size(engine->dkeyCount); engine->invDkeyOffset = dkeyOffset; copy_bytes(ptr + dkeyOffset, rm.getDkeyToReportTable()); engine->somHorizon = ssm.somPrecision(); engine->somLocationCount = ssm.numSomSlots(); + engine->somLocationFatbitSize = fatbit_size(engine->somLocationCount); engine->needsCatchup = bc.needs_catchup ? 
1 : 0; @@ -5453,8 +5457,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->activeArrayCount = activeArrayCount; engine->activeLeftCount = activeLeftCount; engine->queueCount = queue_count; + engine->activeQueueArraySize = fatbit_size(queue_count); engine->eagerIterOffset = eagerIterOffset; engine->handledKeyCount = bc.handledKeys.size(); + engine->handledKeyFatbitSize = fatbit_size(engine->handledKeyCount); engine->rolesWithStateCount = bc.numStates; @@ -5474,11 +5480,13 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->lastByteHistoryIterOffset = lastByteOffset; - u32 delay_count = verify_u32(final_id_to_literal.size() - delay_base_id); - engine->delay_count = delay_count; + engine->delay_count = + verify_u32(final_id_to_literal.size() - delay_base_id); + engine->delay_fatbit_size = fatbit_size(engine->delay_count); engine->delay_base_id = delay_base_id; engine->anchored_base_id = anchored_base_id; engine->anchored_count = delay_base_id - anchored_base_id; + engine->anchored_fatbit_size = fatbit_size(engine->anchored_count); engine->rosePrefixCount = rosePrefixCount; @@ -5503,6 +5511,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->ematcherOffset = ematcherOffset; engine->sbmatcherOffset = sbmatcherOffset; engine->fmatcherOffset = fmatcherOffset; + engine->longLitTableOffset = longLitTableOffset; engine->amatcherMinWidth = findMinWidth(*this, ROSE_ANCHORED); engine->fmatcherMinWidth = findMinWidth(*this, ROSE_FLOATING); engine->eodmatcherMinWidth = findMinWidth(*this, ROSE_EOD_ANCHORED); @@ -5528,7 +5537,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->totalNumLiterals = verify_u32(literal_info.size()); engine->asize = verify_u32(asize); engine->ematcherRegionSize = ematcher_region_size; - engine->floatingStreamState = verify_u32(floatingStreamStateRequired); + engine->longLitStreamState = verify_u32(longLitStreamStateRequired); engine->boundary.reportEodOffset = boundary_out.reportEodOffset; engine->boundary.reportZeroOffset = boundary_out.reportZeroOffset; @@ -5545,7 +5554,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { &engine->tStateSize); // Copy in other tables - copy_bytes(ptr + bc.engine_blob_base, bc.engine_blob); + bc.engine_blob.write_bytes(engine.get()); copy_bytes(ptr + engine->leftOffset, leftInfoTable); fillLookaroundTables(ptr + lookaroundTableOffset, @@ -5556,7 +5565,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // Safety check: we shouldn't have written anything to the engine blob // after we copied it into the engine bytecode. - assert(byte_length(bc.engine_blob) == engineBlobSize); + assert(bc.engine_blob.size() == engineBlobSize); // Add a small write engine if appropriate. 
engine = addSmallWriteEngine(*this, move(engine)); diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp index c65e840d..7987b0f6 100644 --- a/src/rose/rose_build_castle.cpp +++ b/src/rose/rose_build_castle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -163,7 +163,7 @@ void renovateCastle(RoseBuildImpl &tbi, CastleProto *castle, for (RoseVertex v : verts) { assert(g[v].left.castle.get() == castle); - DEBUG_PRINTF("%zu checks at lag %u\n", g[v].idx, g[v].left.lag); + DEBUG_PRINTF("%zu checks at lag %u\n", g[v].index, g[v].left.lag); vector lits = literals_for_vertex(tbi, v); for (const auto &e : lits) { DEBUG_PRINTF("%s +%u\n", dumpString(e.s).c_str(), e.delay); diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 3f82a9cc..e13d7c5c 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -43,11 +43,11 @@ #include "nfa/nfa_internal.h" #include "nfa/rdfa.h" #include "nfagraph/ng_holder.h" -#include "nfagraph/ng_dump.h" #include "nfagraph/ng_execute.h" #include "nfagraph/ng_is_equal.h" #include "nfagraph/ng_limex.h" #include "nfagraph/ng_mcclellan.h" +#include "nfagraph/ng_prune.h" #include "nfagraph/ng_repeat.h" #include "nfagraph/ng_reports.h" #include "nfagraph/ng_stop.h" @@ -88,172 +88,6 @@ namespace ue2 { #define ANCHORED_REHOME_DEEP 25 #define ANCHORED_REHOME_SHORT_LEN 3 -#ifdef DEBUG -static UNUSED -void printLitInfo(const rose_literal_info &li, u32 id) { - DEBUG_PRINTF("lit_info %u\n", id); - DEBUG_PRINTF(" parent %u%s", li.undelayed_id, - li.delayed_ids.empty() ? "":", children:"); - for (u32 d_id : li.delayed_ids) { - printf(" %u", d_id); - } - printf("\n"); - DEBUG_PRINTF(" group %llu %s\n", li.group_mask, li.squash_group ? "s":""); -} -#endif - -static -void allocateFinalIdToSet(const RoseGraph &g, const set &lits, - deque *literal_info, - map > *final_id_to_literal, - u32 *next_final_id) { - /* We can allocate the same final id to multiple literals of the same type - * if they share the same vertex set and trigger the same delayed literal - * ids and squash the same roles and have the same group squashing - * behaviour. Benefits literals cannot be merged. */ - - for (u32 int_id : lits) { - rose_literal_info &curr_info = (*literal_info)[int_id]; - const auto &verts = curr_info.vertices; - - if (!verts.empty() && !curr_info.requires_benefits - && curr_info.delayed_ids.empty()) { - vector cand; - insert(&cand, cand.end(), g[*verts.begin()].literals); - for (auto v : verts) { - vector temp; - set_intersection(cand.begin(), cand.end(), - g[v].literals.begin(), - g[v].literals.end(), - inserter(temp, temp.end())); - cand.swap(temp); - } - - for (u32 cand_id : cand) { - if (cand_id >= int_id) { - break; - } - - const rose_literal_info &cand_info = (*literal_info)[cand_id]; - - if (cand_info.requires_benefits) { - continue; - } - - if (!cand_info.delayed_ids.empty()) { - /* TODO: allow cases where delayed ids are equivalent. 
- * This is awkward currently as the have not had their - * final ids allocated yet */ - continue; - } - - if (lits.find(cand_id) == lits.end() - || cand_info.vertices.size() != verts.size() - || cand_info.squash_group != curr_info.squash_group) { - continue; - } - - /* if we are squashing groups we need to check if they are the - * same group */ - if (cand_info.squash_group - && cand_info.group_mask != curr_info.group_mask) { - continue; - } - - u32 final_id = cand_info.final_id; - assert(final_id != MO_INVALID_IDX); - assert(curr_info.final_id == MO_INVALID_IDX); - curr_info.final_id = final_id; - (*final_id_to_literal)[final_id].insert(int_id); - goto next_lit; - } - } - - /* oh well, have to give it a fresh one, hang the expense */ - DEBUG_PRINTF("allocating final id %u to %u\n", *next_final_id, int_id); - assert(curr_info.final_id == MO_INVALID_IDX); - curr_info.final_id = *next_final_id; - (*final_id_to_literal)[*next_final_id].insert(int_id); - (*next_final_id)++; - next_lit:; - } -} - -static -bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) { - assert(lit_id < build.literal_info.size()); - const auto &info = build.literal_info[lit_id]; - if (!info.vertices.empty()) { - return true; - } - - for (const u32 &delayed_id : info.delayed_ids) { - assert(delayed_id < build.literal_info.size()); - const rose_literal_info &delayed_info = build.literal_info[delayed_id]; - if (!delayed_info.vertices.empty()) { - return true; - } - } - - DEBUG_PRINTF("literal %u has no refs\n", lit_id); - return false; -} - -/** \brief Allocate final literal IDs for all literals. - * - * These are the literal ids used in the bytecode. - */ -static -void allocateFinalLiteralId(RoseBuildImpl &tbi) { - RoseGraph &g = tbi.g; - - set anch; - set norm; - set delay; - - /* undelayed ids come first */ - assert(tbi.final_id_to_literal.empty()); - u32 next_final_id = 0; - for (u32 i = 0; i < tbi.literal_info.size(); i++) { - assert(!tbi.hasFinalId(i)); - - if (!isUsedLiteral(tbi, i)) { - /* what is this literal good for? absolutely nothing */ - continue; - } - - // The special EOD event literal has its own program and does not need - // a real literal ID. - if (i == tbi.eod_event_literal_id) { - assert(tbi.eod_event_literal_id != MO_INVALID_IDX); - continue; - } - - if (tbi.isDelayed(i)) { - assert(!tbi.literal_info[i].requires_benefits); - delay.insert(i); - } else if (tbi.literals.right.at(i).table == ROSE_ANCHORED) { - anch.insert(i); - } else { - norm.insert(i); - } - } - - /* normal lits */ - allocateFinalIdToSet(g, norm, &tbi.literal_info, &tbi.final_id_to_literal, - &next_final_id); - - /* next anchored stuff */ - tbi.anchored_base_id = next_final_id; - allocateFinalIdToSet(g, anch, &tbi.literal_info, &tbi.final_id_to_literal, - &next_final_id); - - /* delayed ids come last */ - tbi.delay_base_id = next_final_id; - allocateFinalIdToSet(g, delay, &tbi.literal_info, &tbi.final_id_to_literal, - &next_final_id); -} - #define MAX_EXPLOSION_NC 3 static bool limited_explosion(const ue2_literal &s) { @@ -285,7 +119,12 @@ void RoseBuildImpl::handleMixedSensitivity(void) { continue; } - if (limited_explosion(lit.s)) { + // We don't want to explode long literals, as they require confirmation + // with a CHECK_LITERAL instruction and need unique final_ids. + // TODO: we could allow explosion for literals where the prefixes + // covered by CHECK_LITERAL are identical. 
+ if (lit.s.length() <= ROSE_LONG_LITERAL_THRESHOLD_MIN && + limited_explosion(lit.s)) { DEBUG_PRINTF("need to explode existing string '%s'\n", dumpString(lit.s).c_str()); literal_info[id].requires_explode = true; @@ -366,14 +205,6 @@ bool RoseBuildImpl::hasOnlyPseudoStarInEdges(RoseVertex v) const { return true; } -void RoseBuildImpl::renumberVertices() { - vertexIndex = 0; - DEBUG_PRINTF("renumbering vertices\n"); - for (auto v : vertices_range(g)) { - g[v].idx = vertexIndex++; - } -} - static size_t trailerDueToSelf(const rose_literal_id &lit) { size_t trailer = lit.s.length() - maxPeriod(lit.s); @@ -392,7 +223,7 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { const RoseVertex u = source(e, g); /* pred role */ const RoseVertex v = target(e, g); /* current role */ - DEBUG_PRINTF("find history for [%zu,%zu]\n", g[u].idx, g[v].idx); + DEBUG_PRINTF("find history for [%zu,%zu]\n", g[u].index, g[v].index); DEBUG_PRINTF("u has min_offset=%u, max_offset=%u\n", g[u].min_offset, g[u].max_offset); @@ -446,7 +277,7 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { // Non-EOD cases. DEBUG_PRINTF("examining edge [%zu,%zu] with bounds {%u,%u}\n", - g[u].idx, g[v].idx, g[e].minBound, g[e].maxBound); + g[u].index, g[v].index, g[e].minBound, g[e].maxBound); if (tbi.isAnchored(v)) { // Matches for literals in the anchored table will always arrive at the @@ -950,19 +781,230 @@ void RoseBuildImpl::findTransientLeftfixes(void) { /** Find all the different roses and their associated literals. */ static -map> findLeftSucc(RoseBuildImpl &tbi) { +map> findLeftSucc(const RoseBuildImpl &build) { map> leftfixes; - for (auto v : vertices_range(tbi.g)) { - if (tbi.g[v].left) { - const LeftEngInfo &lei = tbi.g[v].left; + for (auto v : vertices_range(build.g)) { + if (build.g[v].left) { + const LeftEngInfo &lei = build.g[v].left; leftfixes[lei].push_back(v); } } return leftfixes; } +namespace { +struct infix_info { + set preds; + set succs; +}; +} + static -bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left, +map findInfixGraphInfo(const RoseBuildImpl &build) { + map rv; + + for (auto v : vertices_range(build.g)) { + if (!build.g[v].left) { + continue; + } + + if (build.isRootSuccessor(v)) { + DEBUG_PRINTF("a prefix is never an infix\n"); + continue; + } + + /* ensure only proper nfas */ + const LeftEngInfo &lei = build.g[v].left; + if (!lei.graph) { + continue; + } + if (lei.haig || lei.dfa) { + continue; + } + assert(!lei.castle); + infix_info &info = rv[lei.graph.get()]; + insert(&info.preds, inv_adjacent_vertices_range(v, build.g)); + info.succs.insert(v); + } + + return rv; +} + +static +map> getTopInfo(const NGHolder &h) { + map> rv; + for (NFAEdge e : out_edges_range(h.start, h)) { + for (u32 t : h[e].tops) { + rv[t].insert(e); + } + } + return rv; +} + +static +u32 findUnusedTop(const map> &tops) { + u32 i = 0; + while (contains(tops, i)) { + i++; + } + return i; +} + +static +bool reduceTopTriggerLoad(RoseBuildImpl &build, NGHolder &h, RoseVertex u) { + RoseGraph &g = build.g; + + set tops; /* tops triggered by u */ + for (RoseEdge e : out_edges_range(u, g)) { + RoseVertex v = target(e, g); + if (g[v].left.graph.get() != &h) { + continue; + } + tops.insert(g[e].rose_top); + } + + assert(!tops.empty()); + if (tops.size() <= 1) { + return false; + } + DEBUG_PRINTF("%zu triggers %zu tops for %p\n", build.g[u].index, + tops.size(), &h); + + auto h_top_info = getTopInfo(h); + flat_set edges_to_trigger; + for (u32 t : tops) { 
+        insert(&edges_to_trigger, h_top_info[t]);
+    }
+
+    u32 new_top = ~0U;
+    /* check if there is already a top with the right successor set */
+    for (const auto &elem : h_top_info) {
+        if (elem.second == edges_to_trigger) {
+            new_top = elem.first;
+            break;
+        }
+    }
+
+    /* if no existing suitable top, add a new top for us */
+    if (new_top == ~0U) {
+        new_top = findUnusedTop(h_top_info);
+
+        /* add top to edges out of start */
+        for (NFAEdge e : out_edges_range(h.start, h)) {
+            if (has_intersection(tops, h[e].tops)) {
+                h[e].tops.insert(new_top);
+            }
+        }
+
+        /* check still implementable if we add a new top */
+        if (!isImplementableNFA(h, nullptr, build.cc)) {
+            DEBUG_PRINTF("unable to add new top\n");
+            for (NFAEdge e : out_edges_range(h.start, h)) {
+                h[e].tops.erase(new_top);
+            }
+            /* we should be back to the original graph */
+            assert(isImplementableNFA(h, nullptr, build.cc));
+            return false;
+        }
+    }
+
+    DEBUG_PRINTF("using new merged top %u\n", new_top);
+    assert(new_top != ~0U);
+    for (RoseEdge e: out_edges_range(u, g)) {
+        RoseVertex v = target(e, g);
+        if (g[v].left.graph.get() != &h) {
+            continue;
+        }
+        g[e].rose_top = new_top;
+    }
+
+    return true;
+}
+
+static
+void packInfixTops(NGHolder &h, RoseGraph &g,
+                   const set<RoseVertex> &verts) {
+    if (!is_triggered(h)) {
+        DEBUG_PRINTF("not triggered, no tops\n");
+        return;
+    }
+    assert(isCorrectlyTopped(h));
+    DEBUG_PRINTF("pruning unused tops\n");
+    flat_set<u32> used_tops;
+    for (auto v : verts) {
+        assert(g[v].left.graph.get() == &h);
+
+        for (const auto &e : in_edges_range(v, g)) {
+            u32 top = g[e].rose_top;
+            used_tops.insert(top);
+        }
+    }
+
+    map<u32, u32> top_mapping;
+    for (u32 t : used_tops) {
+        u32 new_top = top_mapping.size();
+        top_mapping[t] = new_top;
+    }
+
+    for (auto v : verts) {
+        assert(g[v].left.graph.get() == &h);
+
+        for (const auto &e : in_edges_range(v, g)) {
+            g[e].rose_top = top_mapping.at(g[e].rose_top);
+        }
+    }
+
+    vector<NFAEdge> dead;
+    for (const auto &e : out_edges_range(h.start, h)) {
+        NFAVertex v = target(e, h);
+        if (v == h.startDs) {
+            continue; // stylised edge, leave it alone.
+        }
+        flat_set<u32> updated_tops;
+        for (u32 t : h[e].tops) {
+            if (contains(top_mapping, t)) {
+                updated_tops.insert(top_mapping.at(t));
+            }
+        }
+        h[e].tops = move(updated_tops);
+        if (h[e].tops.empty()) {
+            DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index);
+            dead.push_back(e);
+        }
+    }
+
+    if (dead.empty()) {
+        return;
+    }
+
+    remove_edges(dead, h);
+    pruneUseless(h);
+    clearReports(h); // As we may have removed vacuous edges.
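    /* (Illustrative aside, not from the patch.) The remapping above packs a
     * sparse set of used tops into a dense range; flat_set iterates in sorted
     * order, so relative order is preserved. A minimal standalone sketch with
     * hypothetical values:
     *
     *     ue2::flat_set<u32> used = {9, 2, 5};
     *     std::map<u32, u32> remap;
     *     for (u32 t : used) {
     *         remap[t] = remap.size(); // 2->0, 5->1, 9->2
     *     }
     *
     * Dense numbering keeps the infix's top count small after the merges
     * performed by reduceTopTriggerLoad(). */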
+}
+
+static
+void reduceTopTriggerLoad(RoseBuildImpl &build) {
+    auto infixes = findInfixGraphInfo(build);
+
+    for (auto &p : infixes) {
+        if (onlyOneTop(*p.first)) {
+            continue;
+        }
+
+        bool changed = false;
+        for (RoseVertex v : p.second.preds) {
+            changed |= reduceTopTriggerLoad(build, *p.first, v);
+        }
+
+        if (changed) {
+            packInfixTops(*p.first, build.g, p.second.succs);
+            reduceImplementableGraph(*p.first, SOM_NONE, nullptr, build.cc);
+        }
+    }
+}
+
+static
+bool triggerKillsRoseGraph(const RoseBuildImpl &build, const left_id &left,
                            const set<ue2_literal> &all_lits,
                            const RoseEdge &e) {
     assert(left.graph());
@@ -978,8 +1020,8 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
 
     /* check each pred literal to see if they all kill previous graph
      * state */
-    for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) {
-        const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id);
+    for (u32 lit_id : build.g[source(e, build.g)].literals) {
+        const rose_literal_id &pred_lit = build.literals.right.at(lit_id);
         const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s);
 
         DEBUG_PRINTF("running graph %zu\n", states.size());
@@ -995,7 +1037,7 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
 }
 
 static
-bool triggerKillsRose(const RoseBuildImpl &tbi, const left_id &left,
+bool triggerKillsRose(const RoseBuildImpl &build, const left_id &left,
                       const set<ue2_literal> &all_lits, const RoseEdge &e) {
     if (left.haig()) {
         /* TODO: To allow this for som-based engines we would also need to
@@ -1005,32 +1047,30 @@ bool triggerKillsRose(const RoseBuildImpl &tbi, const left_id &left,
     }
 
     if (left.graph()) {
-        return triggerKillsRoseGraph(tbi, left, all_lits, e);
+        return triggerKillsRoseGraph(build, left, all_lits, e);
     }
 
     if (left.castle()) {
-        return triggerKillsRoseCastle(tbi, left, all_lits, e);
+        return triggerKillsRoseCastle(build, left, all_lits, e);
     }
 
     return false;
 }
 
+/* Sometimes the arrival of a top for a rose infix can ensure that the nfa
+ * would be dead at that time. In the case of multiple trigger literals, we
+ * can only base our decision on that portion of the literal after any
+ * overlapping literals.
+ */
 static
-void inspectRoseTops(RoseBuildImpl &tbi) {
-    /* Sometimes the arrival of a top for a rose infix can ensure that the nfa
-     * would be dead at that time.
-     * In the case of multiple trigger literals we can only base our decision
-     * on that portion of literal after any overlapping literals */
+void findTopTriggerCancels(RoseBuildImpl &build) {
+    auto left_succ = findLeftSucc(build); /* leftfixes -> succ verts */
-    map<left_id, vector<RoseVertex>> roses =
-        findLeftSucc(tbi); /* rose -> succ verts */
-
-    for (const auto &r : roses) {
+    for (const auto &r : left_succ) {
         const left_id &left = r.first;
         const vector<RoseVertex> &succs = r.second;
         assert(!succs.empty());
 
-        if (tbi.isRootSuccessor(*succs.begin())) {
+        if (build.isRootSuccessor(*succs.begin())) {
             /* a prefix is never an infix */
             continue;
         }
@@ -1040,10 +1080,10 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
         set<u32> pred_lit_ids;
 
         for (auto v : succs) {
-            for (const auto &e : in_edges_range(v, tbi.g)) {
-                RoseVertex u = source(e, tbi.g);
-                tops_seen.insert(tbi.g[e].rose_top);
-                insert(&pred_lit_ids, tbi.g[u].literals);
+            for (const auto &e : in_edges_range(v, build.g)) {
+                RoseVertex u = source(e, build.g);
+                tops_seen.insert(build.g[e].rose_top);
+                insert(&pred_lit_ids, build.g[u].literals);
                 rose_edges.insert(e);
             }
         }
@@ -1055,7 +1095,7 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
 
         for (u32 lit_id : pred_lit_ids) {
-            const rose_literal_id &p_lit = tbi.literals.right.at(lit_id);
+            const rose_literal_id &p_lit = build.literals.right.at(lit_id);
             if (p_lit.delay || p_lit.table == ROSE_ANCHORED) {
                 goto next_rose;
             }
@@ -1067,15 +1107,22 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
                      all_lits.size(), rose_edges.size());
 
         for (const auto &e : rose_edges) {
-            if (triggerKillsRose(tbi, left, all_lits, e)) {
+            if (triggerKillsRose(build, left, all_lits, e)) {
                 DEBUG_PRINTF("top will override previous rose state\n");
-                tbi.g[e].rose_cancel_prev_top = true;
+                build.g[e].rose_cancel_prev_top = true;
             }
         }
     next_rose:;
     }
 }
 
+static
+void optimiseRoseTops(RoseBuildImpl &build) {
+    reduceTopTriggerLoad(build);
+    /* prune unused tops ? */
+    findTopTriggerCancels(build);
+}
+
 static
 void buildRoseSquashMasks(RoseBuildImpl &tbi) {
     /* Rose nfa squash masks are applied to the groups when the nfa can no
@@ -1256,22 +1303,16 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai,
     assert(old_id < tbi.literal_info.size());
     const rose_literal_info &li = tbi.literal_info[old_id];
 
-    // For compile determinism, operate over literal vertices in index
-    // order.
-    vector<RoseVertex> lit_verts(begin(li.vertices), end(li.vertices));
-    sort(begin(lit_verts), end(lit_verts), VertexIndexComp(g));
-
-    for (auto lit_v : lit_verts) {
+    for (auto lit_v : li.vertices) {
         // Clone vertex with the new literal ID.
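        // (An aside on the mechanics, inferred from the call below: the
        // property-copying add_vertex(g[lit_v], g) overload duplicates all of
        // lit_v's vertex properties; the code then rewrites only the literal
        // set and the min/max offsets for the new anchored context.)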
RoseVertex v = add_vertex(g[lit_v], g); - g[v].idx = tbi.vertexIndex++; g[v].literals.clear(); g[v].literals.insert(lit_id); g[v].min_offset = sai.min_bound + sai.literal.length(); g[v].max_offset = sai.max_bound + sai.literal.length(); lit_info.vertices.insert(v); - RoseEdge e = add_edge(anchored_root, v, g).first; + RoseEdge e = add_edge(anchored_root, v, g); g[e].minBound = sai.min_bound; g[e].maxBound = sai.max_bound; } @@ -1292,11 +1333,10 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const ue2_literal &lit, RoseGraph &g = tbi.g; RoseVertex v = add_vertex(g); - g[v].idx = tbi.vertexIndex++; g[v].literals.insert(lit_id); g[v].reports = reports; - RoseEdge e = add_edge(tbi.root, v, g).first; + RoseEdge e = add_edge(tbi.root, v, g); g[e].minBound = 0; g[e].maxBound = ROSE_BOUND_INF; g[v].min_offset = 1; @@ -1502,7 +1542,7 @@ bool historiesAreValid(const RoseGraph &g) { for (const auto &e : edges_range(g)) { if (g[e].history == ROSE_ROLE_HISTORY_INVALID) { DEBUG_PRINTF("edge [%zu,%zu] has invalid history\n", - g[source(e, g)].idx, g[target(e, g)].idx); + g[source(e, g)].index, g[target(e, g)].index); return false; } } @@ -1521,18 +1561,20 @@ bool danglingVertexRef(RoseBuildImpl &tbi) { const ue2::unordered_set valid_vertices(vi, ve); if (!contains(valid_vertices, tbi.anchored_root)) { - DEBUG_PRINTF("anchored root vertex %p not in graph\n", - tbi.anchored_root); + DEBUG_PRINTF("anchored root vertex %zu not in graph\n", + tbi.g[tbi.anchored_root].index); return true; } for (const auto &e : tbi.ghost) { if (!contains(valid_vertices, e.first)) { - DEBUG_PRINTF("ghost key vertex %p not in graph\n", e.first); + DEBUG_PRINTF("ghost key vertex %zu not in graph\n", + tbi.g[e.first].index); return true; } if (!contains(valid_vertices, e.second)) { - DEBUG_PRINTF("ghost value vertex %p not in graph\n", e.second); + DEBUG_PRINTF("ghost value vertex %zu not in graph\n", + tbi.g[e.second].index); return true; } } @@ -1544,63 +1586,16 @@ static bool roleOffsetsAreValid(const RoseGraph &g) { for (auto v : vertices_range(g)) { if (g[v].min_offset >= ROSE_BOUND_INF) { - DEBUG_PRINTF("invalid min_offset for role %zu\n", g[v].idx); + DEBUG_PRINTF("invalid min_offset for role %zu\n", g[v].index); return false; } if (g[v].min_offset > g[v].max_offset) { - DEBUG_PRINTF("min_offset > max_offset for %zu\n", g[v].idx); + DEBUG_PRINTF("min_offset > max_offset for %zu\n", g[v].index); return false; } } return true; } - -static UNUSED -bool hasOrphanedTops(const RoseBuildImpl &tbi) { - const RoseGraph &g = tbi.g; - - ue2::unordered_map > roses; - ue2::unordered_map > suffixes; - - for (auto v : vertices_range(g)) { - if (g[v].left) { - set &tops = roses[g[v].left]; - if (tbi.isRootSuccessor(v)) { - // Prefix, has only one top. - tops.insert(0); - } else { - // Tops for infixes come from the in-edges. 
- for (const auto &e : in_edges_range(v, g)) { - tops.insert(g[e].rose_top); - } - } - } - if (g[v].suffix) { - suffixes[g[v].suffix].insert(g[v].suffix.top); - } - } - - for (const auto &e : roses) { - if (all_tops(e.first) != e.second) { - DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n", - as_string_list(all_tops(e.first)).c_str(), - as_string_list(e.second).c_str()); - return true; - } - } - - for (const auto &e : suffixes) { - if (all_tops(e.first) != e.second) { - DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n", - as_string_list(all_tops(e.first)).c_str(), - as_string_list(e.second).c_str()); - return true; - } - } - - return false; -} - #endif // NDEBUG aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { @@ -1681,13 +1676,17 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { mergeSmallLeftfixes(*this); } + assert(!hasOrphanedTops(*this)); + // Do a rose-merging aliasing pass. aliasRoles(*this, true); + assert(!hasOrphanedTops(*this)); // Run a merge pass over the outfixes as well. mergeOutfixes(*this); assert(!danglingVertexRef(*this)); + assert(!hasOrphanedTops(*this)); findMoreLiteralMasks(*this); @@ -1697,8 +1696,7 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { /* final prep work */ remapCastleTops(*this); - allocateFinalLiteralId(*this); - inspectRoseTops(*this); + optimiseRoseTops(*this); buildRoseSquashMasks(*this); rm.assignDkeys(this); diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index 1578dda1..b151c0c9 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -163,6 +163,8 @@ unique_ptr convertLeafToHolder(const RoseGraph &g, } } + setTops(*out); + // Literal vertices wired to accept. NFAVertex litfirst, litlast; tie(litfirst, litlast) = addLiteralVertices(g, literals, t_v, *out); @@ -288,7 +290,7 @@ bool isUnconvertibleLeaf(const RoseBuildImpl &tbi, const RoseVertex v) { // Find all of the leaves with literals whose length is <= len. static -void findBadLeaves(RoseBuildImpl &tbi, RoseVertexSet &bad) { +void findBadLeaves(RoseBuildImpl &tbi, set &bad) { RoseGraph &g = tbi.g; u32 len = tbi.cc.grey.roseMaxBadLeafLength; @@ -307,15 +309,7 @@ void findBadLeaves(RoseBuildImpl &tbi, RoseVertexSet &bad) { const rose_literal_info &info = tbi.literal_info[lid]; - // Because we do the "clone pred and re-home" trick below, we need to - // iterate over our vertices in a defined ordering, otherwise we'll get - // non-determinism in our bytecode. So, copy and sort this literal's - // vertices. 
- - vector verts(info.vertices.begin(), info.vertices.end()); - sort(verts.begin(), verts.end(), VertexIndexComp(g)); - - for (auto v : verts) { + for (auto v : info.vertices) { if (!isLeafNode(v, g)) { continue; } @@ -329,7 +323,7 @@ void findBadLeaves(RoseBuildImpl &tbi, RoseVertexSet &bad) { const RoseEdge &e = *in_edges(v, g).first; RoseVertex u = source(e, g); if (out_degree(u, g) != 1) { - DEBUG_PRINTF("re-homing %zu to cloned pred\n", g[v].idx); + DEBUG_PRINTF("re-homing %zu to cloned pred\n", g[v].index); RoseVertex u2 = tbi.cloneVertex(u); for (const auto &e_in : in_edges_range(u, g)) { add_edge(source(e_in, g), u2, g[e_in], g); @@ -338,7 +332,7 @@ void findBadLeaves(RoseBuildImpl &tbi, RoseVertexSet &bad) { remove_edge(e, g); } - DEBUG_PRINTF("%zu is a bad leaf vertex\n", g[v].idx); + DEBUG_PRINTF("%zu is a bad leaf vertex\n", g[v].index); bad.insert(v); } } @@ -346,7 +340,7 @@ void findBadLeaves(RoseBuildImpl &tbi, RoseVertexSet &bad) { void convertBadLeaves(RoseBuildImpl &tbi) { RoseGraph &g = tbi.g; - RoseVertexSet bad(g); + set bad; findBadLeaves(tbi, bad); DEBUG_PRINTF("found %zu bad leaves\n", bad.size()); @@ -369,7 +363,7 @@ void convertBadLeaves(RoseBuildImpl &tbi) { RoseVertex u = source(e, g); assert(!g[u].suffix); g[u].suffix.graph = h; - DEBUG_PRINTF("%zu's nfa holder %p\n", g[u].idx, h.get()); + DEBUG_PRINTF("%zu's nfa holder %p\n", g[u].index, h.get()); dead.push_back(v); } @@ -400,7 +394,10 @@ unique_ptr makeFloodProneSuffix(const ue2_literal &s, size_t len, NFAVertex u = h->start; for (auto it = s.begin() + s.length() - len; it != s.end(); ++it) { NFAVertex v = addHolderVertex(*it, *h); - add_edge(u, v, *h); + NFAEdge e = add_edge(u, v, *h); + if (u == h->start) { + (*h)[e].tops.insert(DEFAULT_TOP); + } u = v; } @@ -708,10 +705,7 @@ bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, assert(g[e_old].maxBound >= bound_max); setEdgeBounds(g, e_old, bound_min, bound_max); } else { - RoseEdge e_new; - UNUSED bool added; - tie(e_new, added) = add_edge(ar, v, g); - assert(added); + RoseEdge e_new = add_edge(ar, v, g); setEdgeBounds(g, e_new, bound_min, bound_max); to_delete->push_back(e_old); } @@ -728,10 +722,8 @@ bool handleStartDsPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, u32 repeatCount = 0; NFAVertex hu = h.startDs; - set start_succ; - set startds_succ; - succ(h, h.start, &start_succ); - succ(h, h.startDs, &startds_succ); + auto start_succ = succs>(h.start, h); + auto startds_succ = succs>(h.startDs, h); if (!is_subset_of(start_succ, startds_succ)) { DEBUG_PRINTF("not a simple chain\n"); @@ -781,14 +773,12 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, assert(in_degree(h.acceptEod, h) == 1); bool anchored = !proper_out_degree(h.startDs, h); - NFAVertex key = nullptr; + NFAVertex key = NGHolder::null_vertex(); NFAVertex base = anchored ? 
h.start : h.startDs; if (!anchored) { - set start_succ; - set startds_succ; - succ(h, h.start, &start_succ); - succ(h, h.startDs, &startds_succ); + auto start_succ = succs>(h.start, h); + auto startds_succ = succs>(h.startDs, h); if (!is_subset_of(start_succ, startds_succ)) { DEBUG_PRINTF("not a simple chain\n"); @@ -797,7 +787,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, } for (auto w : adjacent_vertices_range(base, h)) { - DEBUG_PRINTF("checking %u\n", h[w].index); + DEBUG_PRINTF("checking %zu\n", h[w].index); if (!h[w].char_reach.all()) { continue; } @@ -832,7 +822,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, set exits_and_repeat_verts; for (auto repeat_v : ri.vertices) { - DEBUG_PRINTF("repeat vertex %u\n", h[repeat_v].index); + DEBUG_PRINTF("repeat vertex %zu\n", h[repeat_v].index); succ(h, repeat_v, &exits_and_repeat_verts); exits_and_repeat_verts.insert(repeat_v); } @@ -847,8 +837,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, exits = exits_and_repeat_verts; erase_all(&exits, rep_verts); - set base_succ; - succ(h, base, &base_succ); + auto base_succ = succs>(base, h); base_succ.erase(h.startDs); if (is_subset_of(base_succ, rep_verts)) { @@ -908,10 +897,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, if (source(e_old, g) == ar) { setEdgeBounds(g, e_old, ri.repeatMin + width, ri.repeatMax + width); } else { - RoseEdge e_new; - UNUSED bool added; - tie(e_new, added) = add_edge(ar, v, g); - assert(added); + RoseEdge e_new = add_edge(ar, v, g); setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); to_delete->push_back(e_old); } @@ -963,7 +949,7 @@ void convertPrefixToBounds(RoseBuildImpl &tbi) { continue; } - DEBUG_PRINTF("inspecting prefix of %zu\n", g[v].idx); + DEBUG_PRINTF("inspecting prefix of %zu\n", g[v].index); if (!proper_out_degree(h.startDs, h)) { if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { @@ -1009,7 +995,7 @@ void convertPrefixToBounds(RoseBuildImpl &tbi) { continue; } - DEBUG_PRINTF("inspecting prefix of %zu\n", g[v].idx); + DEBUG_PRINTF("inspecting prefix of %zu\n", g[v].index); if (!proper_out_degree(h.startDs, h)) { if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { @@ -1044,7 +1030,7 @@ void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { continue; } - DEBUG_PRINTF("vertex %zu\n", g[v].idx); + DEBUG_PRINTF("vertex %zu\n", g[v].index); // This pass runs after makeCastles, so we use the fact that bounded // repeat detection has already been done for us. 
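Aside: the hunks above swap the old out-parameter helper succ(h, v, &container) for a container-returning form; in the un-mangled source these calls read succs<set<NFAVertex>>(h.start, h), matching the set<NFAVertex> locals they replace. The helper's real definition lives in the graph utility headers and is not part of this diff; a minimal sketch of the assumed shape, in plain Boost.Graph terms:

#include <set>
#include <tuple>
#include <boost/graph/graph_traits.hpp>

// Sketch only (assumed shape, not the library's definition): collect the
// successors of v into a caller-chosen insertable container, e.g.
//     auto start_succ = succs<std::set<NFAVertex>>(h.start, h);
template <class Container, class Graph>
Container succs(typename boost::graph_traits<Graph>::vertex_descriptor v,
                const Graph &g) {
    Container rv;
    typename boost::graph_traits<Graph>::adjacency_iterator ai, ae;
    for (std::tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) {
        rv.insert(*ai); // insert() keeps set-like semantics
    }
    return rv;
}

Returning the container by value keeps call sites terse and lets the caller pick an ordered set, flat_set, or similar as needed.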
diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 5fb27c55..105ee338 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -104,7 +104,7 @@ public: } os << "[label=\""; - os << "idx=" << g[v].idx <<"\\n"; + os << "index=" << g[v].index <<"\\n"; for (u32 lit_id : g[v].literals) { writeLiteral(os, lit_id); @@ -267,14 +267,14 @@ void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t, ofstream os(ss.str()); RoseGraphWriter writer(build, t); - writeGraphviz(os, build.g, writer, get(&RoseVertexProps::idx, build.g)); + writeGraphviz(os, build.g, writer, get(boost::vertex_index, build.g)); } namespace { struct CompareVertexRole { explicit CompareVertexRole(const RoseGraph &g_in) : g(g_in) {} inline bool operator()(const RoseVertex &a, const RoseVertex &b) const { - return g[a].idx < g[b].idx; + return g[a].index < g[b].index; } private: const RoseGraph &g; @@ -372,7 +372,7 @@ void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) { for (RoseVertex v : verts) { // role info - os << " Index " << g[v].idx << ": groups=0x" << hex << setw(16) + os << " Index " << g[v].index << ": groups=0x" << hex << setw(16) << setfill('0') << g[v].groups << dec; if (g[v].reports.empty()) { @@ -386,13 +386,13 @@ void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) { // pred info for (const auto &ie : in_edges_range(v, g)) { const auto &u = source(ie, g); - os << " Predecessor idx="; + os << " Predecessor index="; if (u == build.root) { os << "ROOT"; } else if (u == build.anchored_root) { os << "ANCHORED_ROOT"; } else { - os << g[u].idx; + os << g[u].index; } os << ": bounds [" << g[ie].minBound << ", "; if (g[ie].maxBound == ROSE_BOUND_INF) { @@ -442,20 +442,26 @@ void dumpTestLiterals(const string &filename, const vector &lits) { static void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { - auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED); + size_t historyRequired = build.calcHistoryRequired(); + size_t longLitLengthThreshold = + calcLongLitThreshold(build, historyRequired); + + auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED, + longLitLengthThreshold); dumpTestLiterals(base + "rose_anchored_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_FLOATING); + lits = fillHamsterLiteralList(build, ROSE_FLOATING, longLitLengthThreshold); dumpTestLiterals(base + "rose_float_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); + lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED, + build.ematcher_region_size); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); if (!build.cc.streaming) { lits = fillHamsterLiteralList(build, ROSE_FLOATING, - ROSE_SMALL_BLOCK_LEN); + ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, - ROSE_SMALL_BLOCK_LEN); + ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); lits.insert(end(lits), begin(lits2), end(lits2)); dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); } diff --git a/src/rose/rose_build_engine_blob.h b/src/rose/rose_build_engine_blob.h new file mode 100644 index 00000000..8542b87b --- /dev/null +++ b/src/rose/rose_build_engine_blob.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above 
copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_BUILD_ENGINE_BLOB_H +#define ROSE_BUILD_ENGINE_BLOB_H + +#include "rose_internal.h" + +#include "ue2common.h" +#include "util/alloc.h" +#include "util/container.h" +#include "util/multibit_build.h" +#include "util/ue2_containers.h" +#include "util/verify_types.h" + +#include +#include + +#include + +namespace ue2 { + +class RoseEngineBlob : boost::noncopyable { +public: + /** \brief Base offset of engine_blob in the Rose engine bytecode. */ + static constexpr u32 base_offset = ROUNDUP_CL(sizeof(RoseEngine)); + + bool empty() const { + return blob.empty(); + } + + size_t size() const { + return blob.size(); + } + + const char *data() const { + return blob.data(); + } + + u32 add(const void *a, const size_t len, const size_t align) { + pad(align); + + size_t rv = base_offset + blob.size(); + assert(rv >= base_offset); + DEBUG_PRINTF("write %zu bytes at offset %zu\n", len, rv); + + assert(ISALIGNED_N(blob.size(), align)); + + blob.resize(blob.size() + len); + memcpy(&blob.back() - len + 1, a, len); + + return verify_u32(rv); + } + + template + u32 add(const T &a) { + static_assert(std::is_pod::value, "should be pod"); + return add(&a, sizeof(a), alignof(T)); + } + + template + u32 add(const T &a, const size_t len) { + static_assert(std::is_pod::value, "should be pod"); + return add(&a, len, alignof(T)); + } + + template + u32 add(Iter b, const Iter &e) { + using value_type = typename std::iterator_traits::value_type; + static_assert(std::is_pod::value, "should be pod"); + + if (b == e) { + return 0; + } + + u32 offset = add(*b); + for (++b; b != e; ++b) { + add(*b); + } + + return offset; + } + + u32 add_iterator(const std::vector &iter) { + auto cache_it = cached_iters.find(iter); + if (cache_it != cached_iters.end()) { + u32 offset = cache_it->second; + DEBUG_PRINTF("cache hit for iter at %u\n", offset); + return offset; + } + + u32 offset = add(iter.begin(), iter.end()); + cached_iters.emplace(iter, offset); + return offset; + } + + void write_bytes(RoseEngine *engine) { + copy_bytes((char *)engine + base_offset, blob); + } + +private: + void pad(size_t align) { + assert(ISALIGNED_N(base_offset, align)); + size_t s = blob.size(); + + if (ISALIGNED_N(s, align)) { + return; + } + + blob.resize(s + 
align - s % align);
+    }
+
+    /** \brief Cache of previously-written sparse iterators. */
+    unordered_map<std::vector<mmbit_sparse_iter>, u32> cached_iters;
+
+    /**
+     * \brief Contents of the Rose bytecode immediately following the
+     * RoseEngine.
+     */
+    std::vector<char, AlignedAllocator<char, 64>> blob;
+};
+
+} // namespace ue2
+
+#endif // ROSE_BUILD_ENGINE_BLOB_H
diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp
index c9e8d215..e91cc297 100644
--- a/src/rose/rose_build_exclusive.cpp
+++ b/src/rose/rose_build_exclusive.cpp
@@ -306,12 +306,12 @@ void findCliques(const map<u32, set<u32>> &exclusiveGroups,
     // Find clique groups
     const auto &clique = removeClique(*cg);
     for (const auto &i : clique) {
-        DEBUG_PRINTF("cliq:%lu\n", i.size());
+        DEBUG_PRINTF("cliq:%zu\n", i.size());
         if (i.size() > 1) {
             exclusive_roles.push_back(i);
         }
     }
-    DEBUG_PRINTF("Clique graph size:%lu\n", exclusive_roles.size());
+    DEBUG_PRINTF("Clique graph size:%zu\n", exclusive_roles.size());
 }
 
 static
@@ -326,7 +326,7 @@ map<u32, vector<u32>> findExclusiveGroups(const RoseBuildImpl &build,
         set<u32> group;
         set<RoseVertex> q1(vertex_map.at(i).begin(), vertex_map.at(i).end());
-        DEBUG_PRINTF("vertex set:%lu\n", q1.size());
+        DEBUG_PRINTF("vertex set:%zu\n", q1.size());
         for (const auto &val : s) {
             set<RoseVertex> q2(vertex_map.at(val).begin(),
                                vertex_map.at(val).end());
diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp
index 5e477e3b..0a1c501f 100644
--- a/src/rose/rose_build_groups.cpp
+++ b/src/rose/rose_build_groups.cpp
@@ -136,7 +136,7 @@ rose_group calcLocalGroup(const RoseVertex v, const RoseGraph &g,
             }
         } else {
             DEBUG_PRINTF("not sibling different mother %zu %zu\n",
-                         g[v].idx, g[w].idx);
+                         g[v].index, g[w].index);
         }
     }
 }
@@ -382,7 +382,7 @@ void assignGroupsToRoles(RoseBuildImpl &build) {
             g[ghost_it->second].groups |= succ_groups;
         }
 
-        DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups);
+        DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].index, g[v].groups);
     }
 }
 
@@ -397,8 +397,7 @@ getVertexGroupMap(const RoseBuildImpl &build) {
     vector<RoseVertex> v_order;
     v_order.reserve(num_vertices(g));
 
-    boost::topological_sort(g, back_inserter(v_order),
-                            vertex_index_map(get(&RoseVertexProps::idx, g)));
+    boost::topological_sort(g, back_inserter(v_order));
 
     unordered_map<RoseVertex, rose_group> vertex_group_map;
     vertex_group_map.reserve(num_vertices(g));
@@ -406,7 +405,7 @@ getVertexGroupMap(const RoseBuildImpl &build) {
     const rose_group initial_groups = build.getInitialGroups();
 
     for (const auto &v : boost::adaptors::reverse(v_order)) {
-        DEBUG_PRINTF("vertex %zu\n", g[v].idx);
+        DEBUG_PRINTF("vertex %zu\n", g[v].index);
 
         if (build.isAnyStart(v)) {
             DEBUG_PRINTF("start vertex, groups=0x%llx\n", initial_groups);
@@ -419,7 +418,7 @@ getVertexGroupMap(const RoseBuildImpl &build) {
         assert(in_degree(v, g) > 0);
         rose_group pred_groups = ~rose_group{0};
         for (auto u : inv_adjacent_vertices_range(v, g)) {
-            DEBUG_PRINTF("pred %zu\n", g[u].idx);
+            DEBUG_PRINTF("pred %zu\n", g[u].index);
             assert(contains(vertex_group_map, u));
             pred_groups &= vertex_group_map.at(u);
         }
diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h
index d239a698..6b326d34 100644
--- a/src/rose/rose_build_impl.h
+++ b/src/rose/rose_build_impl.h
@@ -56,6 +56,8 @@ namespace ue2 {
 
 #define ROSE_GROUPS_MAX 64
 
+#define ROSE_LONG_LITERAL_THRESHOLD_MIN 33
+
 struct BoundaryReports;
 struct CastleProto;
 struct CompileContext;
@@ -525,8 +527,6 @@ public:
     // max overlap considered for every pair (ulit, vlit).
size_t maxLiteralOverlap(RoseVertex u, RoseVertex v) const; - void renumberVertices(void); - bool isPseudoStar(const RoseEdge &e) const; bool isPseudoStarOrFirstOnly(const RoseEdge &e) const; bool hasOnlyPseudoStarInEdges(RoseVertex v) const; @@ -549,7 +549,6 @@ public: const RoseVertex anchored_root; RoseLiteralMap literals; std::map ghost; - size_t vertexIndex; ReportID getNewNfaReport() override { return next_nfa_report++; } @@ -603,6 +602,9 @@ private: ReportID next_nfa_report; }; +size_t calcLongLitThreshold(const RoseBuildImpl &build, + const size_t historyRequired); + // Free functions, in rose_build_misc.cpp bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v); @@ -615,7 +617,8 @@ ue2_literal findNonOverlappingTail(const std::set &lits, void setReportId(NGHolder &g, ReportID id); #ifndef NDEBUG -bool roseHasTops(const RoseGraph &g, RoseVertex v); +bool roseHasTops(const RoseBuildImpl &build, RoseVertex v); +bool hasOrphanedTops(const RoseBuildImpl &build); #endif u64a findMaxOffset(const std::set &reports, const ReportManager &rm); diff --git a/src/rose/rose_build_infix.cpp b/src/rose/rose_build_infix.cpp index e81a7b00..4bbb3525 100644 --- a/src/rose/rose_build_infix.cpp +++ b/src/rose/rose_build_infix.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -108,14 +108,9 @@ void contractVertex(NGHolder &g, NFAVertex v, } static -u32 findMaxInfixMatches(const NGHolder &h, const set &lits) { +u32 findMaxLiteralMatches(const NGHolder &h, const set &lits) { DEBUG_PRINTF("h=%p, %zu literals\n", &h, lits.size()); - //dumpGraph("infix.dot", h.g); - - if (!onlyOneTop(h)) { - DEBUG_PRINTF("more than one top!n"); - return NO_MATCH_LIMIT; - } + //dumpGraph("infix.dot", h); // Indices of vertices that could terminate any of the literals in 'lits'. 
    set<u32> terms;
     for (const auto &lit : lits) {
@@ -168,7 +163,7 @@ u32 findMaxInfixMatches(const NGHolder &h, const set<ue2_literal> &lits) {
     }
     remove_vertices(dead, g);
 
-    //dumpGraph("relaxed.dot", g.g);
+    //dumpGraph("relaxed.dot", g);
 
     depth maxWidth = findMaxWidth(g);
     DEBUG_PRINTF("maxWidth=%s\n", maxWidth.str().c_str());
@@ -262,7 +257,11 @@ u32 findMaxInfixMatches(const left_id &left, const set<ue2_literal> &lits) {
         return findMaxInfixMatches(*left.castle(), lits);
     }
     if (left.graph()) {
-        return findMaxInfixMatches(*left.graph(), lits);
+        if (!onlyOneTop(*left.graph())) {
+            DEBUG_PRINTF("more than one top!\n");
+            return NO_MATCH_LIMIT;
+        }
+        return findMaxLiteralMatches(*left.graph(), lits);
     }
 
     return NO_MATCH_LIMIT;
@@ -279,7 +278,7 @@ void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable,
 
     const NGHolder &g = *left.graph();
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
     if (!proper_out_degree(g.startDs, g)) {
         cyclics.erase(g.startDs);
@@ -287,7 +286,7 @@ void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable,
     CharReach cyclic_cr;
     for (NFAVertex v : cyclics) {
-        DEBUG_PRINTF("considering %u ||=%zu\n", g[v].index,
+        DEBUG_PRINTF("considering %zu ||=%zu\n", g[v].index,
                      g[v].char_reach.count());
         cyclic_cr |= g[v].char_reach;
     }
@@ -315,7 +314,7 @@ void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable,
         lits.insert(ue2_literal(c, false));
     }
 
-    u32 count = findMaxInfixMatches(*left.graph(), lits);
+    u32 count = findMaxLiteralMatches(*left.graph(), lits);
     DEBUG_PRINTF("counting miracle %u\n", count + 1);
     if (count && count < 50) {
         *cm_count = count + 1;
diff --git a/src/rose/rose_build_long_lit.cpp b/src/rose/rose_build_long_lit.cpp
new file mode 100644
index 00000000..c32f49d0
--- /dev/null
+++ b/src/rose/rose_build_long_lit.cpp
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "rose_build_long_lit.h" + +#include "rose_build_engine_blob.h" +#include "rose_build_impl.h" +#include "stream_long_lit_hash.h" +#include "util/alloc.h" +#include "util/bitutils.h" +#include "util/verify_types.h" +#include "util/compile_context.h" + +#include +#include + +using namespace std; + +namespace ue2 { + +/** \brief Minimum size for a non-empty hash table. Must be a power of two. */ +static constexpr u32 MIN_HASH_TABLE_SIZE = 128; + +/** \brief Maximum load factor (between zero and one) for a hash table. */ +static constexpr double MAX_HASH_TABLE_LOAD = 0.7; + +/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */ +static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256; + +/** \brief Maximum load factor (between zero and one) for a bloom filter. */ +static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25; + +struct LongLitModeInfo { + u32 num_literals = 0; //!< Number of strings for this mode. + u32 hashed_positions = 0; //!< Number of hashable string positions. +}; + +struct LongLitInfo { + LongLitModeInfo caseful; + LongLitModeInfo nocase; +}; + +static +u32 roundUpToPowerOfTwo(u32 x) { + assert(x != 0); + u32 bits = lg2(x - 1) + 1; + assert(bits < 32); + return 1U << bits; +} + +static +LongLitInfo analyzeLongLits(const vector &lits, + size_t max_len) { + LongLitInfo info; + + for (const auto &lit : lits) { + auto &lit_info = lit.nocase ? info.nocase : info.caseful; + assert(lit.s.size() > max_len); + lit_info.num_literals++; + lit_info.hashed_positions += lit.s.size() - max_len; + } + + DEBUG_PRINTF("case: hashed %u positions\n", info.caseful.hashed_positions); + DEBUG_PRINTF("nocase: hashed %u positions\n", info.nocase.hashed_positions); + + return info; +} + +static +void addToBloomFilter(vector &bloom, const u8 *substr, bool nocase) { + const u32 num_keys = verify_u32(bloom.size() * 8); + const u32 key_mask = (1U << lg2(num_keys)) -1; + + const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 }; + for (const auto &hash_func : hash_functions) { + u32 hash = hash_func(substr, nocase); + u32 key = hash & key_mask; + DEBUG_PRINTF("set key %u (of %zu)\n", key, bloom.size() * 8); + bloom[key / 8] |= 1U << (key % 8); + } +} + +static +size_t bloomOccupancy(const vector &bloom) { + return accumulate(begin(bloom), end(bloom), 0, + [](const size_t &sum, const u8 &elem) { + return sum + popcount32(elem); + }); +} + +static +double bloomLoad(const vector &bloom) { + return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8); +} + +static +vector buildBloomFilter(const vector &lits, size_t max_len, + size_t num_entries, bool nocase) { + assert(num_entries % 8 == 0); + assert((num_entries & (num_entries - 1)) == 0); // Must be power of two. + + vector bloom(num_entries / 8, 0); + + if (!num_entries) { + return bloom; + } + + for (const auto &lit : lits) { + if (nocase != lit.nocase) { + continue; + } + for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { + const u8 *substr = (const u8 *)lit.s.c_str() + offset; + addToBloomFilter(bloom, substr, nocase); + } + } + + DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n", + nocase ? "nocase" : "caseful", bloomOccupancy(bloom), + num_entries); + + return bloom; +} + + +static +vector makeBloomFilter(const vector &lits, + size_t max_len, bool nocase) { + vector bloom; + + size_t num_entries = MIN_BLOOM_FILTER_SIZE; + for (;;) { + bloom = buildBloomFilter(lits, max_len, num_entries, nocase); + DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n", + nocase ? 
"nocase" : "caseful", num_entries, + bloomLoad(bloom)); + if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) { + break; + } + num_entries *= 2; + } + return bloom; +} + +static +size_t hashTableOccupancy(const vector &tab) { + return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) { + return ent.str_offset != 0; + }); +} + +static +double hashTableLoad(const vector &tab) { + return (double)hashTableOccupancy(tab) / (double)(tab.size()); +} + +static +vector buildHashTable(const vector &lits, + size_t max_len, + const vector &litToOffsetVal, + size_t numEntries, bool nocase) { + vector tab(numEntries, {0,0}); + + if (!numEntries) { + return tab; + } + + map>> hashToLitOffPairs; + + for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) { + const ue2_case_string &lit = lits[lit_id]; + if (nocase != lit.nocase) { + continue; + } + for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { + const u8 *substr = (const u8 *)lit.s.c_str() + offset; + u32 hash = hashLongLiteral(substr, max_len, lit.nocase); + hashToLitOffPairs[hash].emplace_back(lit_id, offset); + } + } + + for (auto &m : hashToLitOffPairs) { + u32 hash = m.first; + vector> &d = m.second; + + // Sort by (offset, string) so that we'll be able to remove identical + // string prefixes. + stable_sort(begin(d), end(d), + [&](const pair &a, const pair &b) { + const auto &str_a = lits[a.first].s; + const auto &str_b = lits[b.first].s; + return tie(a.second, str_a) < tie(b.second, str_b); + }); + + // Remove entries that point to the same literal prefix. + d.erase(unique(begin(d), end(d), + [&](const pair &a, const pair &b) { + if (a.second != b.second) { + return false; + } + const auto &str_a = lits[a.first].s; + const auto &str_b = lits[b.first].s; + const size_t len = max_len + a.second; + return equal(begin(str_a), begin(str_a) + len, + begin(str_b)); + }), + end(d)); + + // Sort d by distance of the residual string (len minus our depth into + // the string). We need to put the 'furthest back' string first. + stable_sort(begin(d), end(d), + [](const pair &a, const pair &b) { + if (a.second != b.second) { + return a.second > b.second; /* longest is first */ + } + return a.first < b.first; + }); + + u32 bucket = hash % numEntries; + + // Placement via linear probing. + for (const auto &lit_offset : d) { + while (tab[bucket].str_offset != 0) { + bucket++; + if (bucket == numEntries) { + bucket = 0; + } + } + + u32 lit_id = lit_offset.first; + u32 offset = lit_offset.second; + + DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash, + lit_id, offset, bucket); + + auto &entry = tab[bucket]; + entry.str_offset = verify_u32(litToOffsetVal.at(lit_id)); + assert(entry.str_offset != 0); + entry.str_len = offset + max_len; + } + } + + DEBUG_PRINTF("%s hash table occupancy %zu of %zu entries\n", + nocase ? "nocase" : "caseful", hashTableOccupancy(tab), + numEntries); + + return tab; +} + +static +vector makeHashTable(const vector &lits, + size_t max_len, + const vector &litToOffsetVal, + u32 numPositions, bool nocase) { + vector tab; + + // Note: for the hash table, we must always have at least enough entries + // for the number of hashable positions. + size_t num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, + numPositions)); + + for (;;) { + tab = buildHashTable(lits, max_len, litToOffsetVal, num_entries, + nocase); + DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n", + nocase ? 
"nocase" : "caseful", num_entries, + hashTableLoad(tab)); + if (hashTableLoad(tab) < MAX_HASH_TABLE_LOAD) { + break; + } + num_entries *= 2; + } + return tab; +} + +static +vector buildLits(const vector &lits, u32 baseOffset, + vector &litToOffsetVal) { + vector blob; + litToOffsetVal.resize(lits.size(), 0); + + u32 lit_id = 0; + for (const auto &lit : lits) { + u32 offset = baseOffset + verify_u32(blob.size()); + blob.insert(blob.end(), begin(lit.s), end(lit.s)); + litToOffsetVal[lit_id] = offset; + lit_id++; + } + + DEBUG_PRINTF("built %zu bytes of strings\n", blob.size()); + return blob; +} + +u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, + vector &lits, + size_t longLitLengthThreshold, + size_t *historyRequired, + size_t *longLitStreamStateRequired) { + // Work in terms of history requirement (i.e. literal len - 1). + const size_t max_len = longLitLengthThreshold - 1; + + // We should only be building the long literal hash table in streaming mode. + if (!build.cc.streaming) { + return 0; + } + + if (lits.empty()) { + DEBUG_PRINTF("no long literals\n"); + return 0; + } + + // The last char of each literal is trimmed as we're not interested in full + // matches, only partial matches. + for (auto &lit : lits) { + assert(!lit.s.empty()); + lit.s.pop_back(); + } + + // Sort by caseful/caseless and in lexicographical order. + stable_sort(begin(lits), end(lits), [](const ue2_case_string &a, + const ue2_case_string &b) { + if (a.nocase != b.nocase) { + return a.nocase < b.nocase; + } + return a.s < b.s; + }); + + // Find literals that are prefixes of other literals (including + // duplicates). Note that we iterate in reverse, since we want to retain + // only the longest string from a set of prefixes. + auto it = unique(lits.rbegin(), lits.rend(), [](const ue2_case_string &a, + const ue2_case_string &b) { + return a.nocase == b.nocase && a.s.size() >= b.s.size() && + equal(b.s.begin(), b.s.end(), a.s.begin()); + }); + + // Erase dupes found by unique(). + lits.erase(lits.begin(), it.base()); + + LongLitInfo info = analyzeLongLits(lits, max_len); + + vector litToOffsetVal; + const size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable)); + vector lit_blob = buildLits(lits, headerSize, litToOffsetVal); + + // Build caseful bloom filter and hash table. + vector bloom_case; + vector tab_case; + if (info.caseful.num_literals) { + bloom_case = makeBloomFilter(lits, max_len, false); + tab_case = makeHashTable(lits, max_len, litToOffsetVal, + info.caseful.hashed_positions, false); + } + + // Build nocase bloom filter and hash table. 
+ vector bloom_nocase; + vector tab_nocase; + if (info.nocase.num_literals) { + bloom_nocase = makeBloomFilter(lits, max_len, true); + tab_nocase = makeHashTable(lits, max_len, litToOffsetVal, + info.nocase.hashed_positions, true); + } + + size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob)); + size_t htOffsetCase = headerSize + wholeLitTabSize; + size_t htOffsetNocase = htOffsetCase + byte_length(tab_case); + size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase); + size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case); + + size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase)); + + // need to add +2 to both of these to allow space for the actual largest + // value as well as handling the fact that we add one to the space when + // storing out a position to allow zero to mean "no stream state value" + u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2)); + u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2)); + u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8; + + auto table = aligned_zmalloc_unique(tabSize); + assert(table); // otherwise would have thrown std::bad_alloc + + // Fill in the RoseLongLitTable header structure. + RoseLongLitTable *header = (RoseLongLitTable *)(table.get()); + header->size = verify_u32(tabSize); + header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255 + header->caseful.hashOffset = verify_u32(htOffsetCase); + header->caseful.hashBits = lg2(tab_case.size()); + header->caseful.streamStateBits = streamBitsCase; + header->caseful.bloomOffset = verify_u32(bloomOffsetCase); + header->caseful.bloomBits = lg2(bloom_case.size() * 8); + header->nocase.hashOffset = verify_u32(htOffsetNocase); + header->nocase.hashBits = lg2(tab_nocase.size()); + header->nocase.streamStateBits = streamBitsNocase; + header->nocase.bloomOffset = verify_u32(bloomOffsetNocase); + header->nocase.bloomBits = lg2(bloom_nocase.size() * 8); + assert(tot_state_bytes < sizeof(u64a)); + header->streamStateBytes = verify_u8(tot_state_bytes); // u8 + + // Copy in the literal strings, hash tables and bloom filters, + copy_bytes(table.get() + headerSize, lit_blob); + copy_bytes(table.get() + htOffsetCase, tab_case); + copy_bytes(table.get() + bloomOffsetCase, bloom_case); + copy_bytes(table.get() + htOffsetNocase, tab_nocase); + copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase); + + DEBUG_PRINTF("built streaming table, size=%zu\n", tabSize); + DEBUG_PRINTF("requires %zu bytes of history\n", max_len); + DEBUG_PRINTF("requires %u bytes of stream state\n", tot_state_bytes); + + *historyRequired = max(*historyRequired, max_len); + *longLitStreamStateRequired = tot_state_bytes; + + return blob.add(table.get(), tabSize, 16); +} + +} // namespace ue2 diff --git a/src/rose/rose_build_long_lit.h b/src/rose/rose_build_long_lit.h new file mode 100644 index 00000000..a77b1b69 --- /dev/null +++ b/src/rose/rose_build_long_lit.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_BUILD_LONG_LIT_H +#define ROSE_BUILD_LONG_LIT_H + +#include "ue2common.h" + +#include + +namespace ue2 { + +class RoseBuildImpl; +class RoseEngineBlob; +struct ue2_case_string; + +u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, + std::vector &lits, + size_t longLitLengthThreshold, + size_t *historyRequired, + size_t *longLitStreamStateRequired); + +} // namespace ue2 + + +#endif // ROSE_BUILD_LONG_LIT_H diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index ba77b402..10bd59de 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,7 +72,7 @@ void getForwardReach(const NGHolder &g, u32 top, map &look) { if (v == g.startDs) { continue; } - if (g[e].top == top) { + if (contains(g[e].tops, top)) { curr.insert(v); } } @@ -261,7 +261,7 @@ void findForwardReach(const RoseGraph &g, const RoseVertex v, for (const auto &e : out_edges_range(v, g)) { RoseVertex t = target(e, g); if (!g[t].left) { - DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].idx); + DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index); return; } rose_look.push_back(map()); @@ -460,17 +460,41 @@ void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v, } } +static +map findLiteralReach(const rose_literal_id &lit) { + map look; + + u32 i = lit.delay + 1; + for (auto it = lit.s.rbegin(), ite = lit.s.rend(); it != ite; ++it) { + look[0 - i] |= *it; + i++; + } + + return look; +} + static map findLiteralReach(const RoseBuildImpl &build, const RoseVertex v) { + bool first = true; map look; for (u32 lit_id : build.g[v].literals) { const rose_literal_id &lit = build.literals.right.at(lit_id); + auto lit_look = findLiteralReach(lit); - u32 i = lit.delay + 1; - for (auto it = lit.s.rbegin(), ite = lit.s.rend(); it != ite; ++it) { - look[0 - i] |= *it; - i++; + if (first) { + look = move(lit_look); + first = false; + } else { + for (auto it = look.begin(); it != look.end();) { + auto l_it = lit_look.find(it->first); + if (l_it == lit_look.end()) { + 
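                    /* This offset is not reachable under the current literal,
                     * so nothing can be asserted about it for the role as a
                     * whole: the merged map keeps only offsets common to all
                     * literals, intersecting keys and unioning reach. */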
it = look.erase(it); + } else { + it->second |= l_it->second; + ++it; + } + } } } @@ -585,7 +609,7 @@ bool getTransientPrefixReach(const NGHolder &g, u32 lag, NFAVertex v = *(inv_adjacent_vertices(g.accept, g).first); u32 i = lag + 1; while (v != g.startDs) { - DEBUG_PRINTF("i=%u, v=%u\n", i, g[v].index); + DEBUG_PRINTF("i=%u, v=%zu\n", i, g[v].index); if (is_special(v, g)) { DEBUG_PRINTF("special\n"); return false; diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 2eb70f60..01633c06 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -102,7 +102,7 @@ bool maskFromLeftGraph(const LeftEngInfo &left, vector &msk, CharReach cr; for (NFAVertex v : curr) { const auto &v_cr = h[v].char_reach; - DEBUG_PRINTF("vertex %u, reach %s\n", h[v].index, + DEBUG_PRINTF("vertex %zu, reach %s\n", h[v].index, describeClass(v_cr).c_str()); cr |= v_cr; insert(&next, inv_adjacent_vertices(v, h)); @@ -438,45 +438,43 @@ static bool isNoRunsVertex(const RoseBuildImpl &build, RoseVertex u) { const RoseGraph &g = build.g; if (!g[u].isBoring()) { - DEBUG_PRINTF("u=%zu is not boring\n", g[u].idx); + DEBUG_PRINTF("u=%zu is not boring\n", g[u].index); return false; } if (!g[u].reports.empty()) { - DEBUG_PRINTF("u=%zu has accept\n", g[u].idx); + DEBUG_PRINTF("u=%zu has accept\n", g[u].index); return false; } /* TODO: handle non-root roles as well. It can't be that difficult... */ - if (!in_degree_equal_to(u, g, 1)) { - DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx); + if (in_degree(u, g) != 1) { + DEBUG_PRINTF("u=%zu is not a root role\n", g[u].index); return false; } - RoseEdge e; - bool exists; - tie(e, exists) = edge_by_target(build.root, u, g); + RoseEdge e = edge(build.root, u, g); - if (!exists) { - DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx); + if (!e) { + DEBUG_PRINTF("u=%zu is not a root role\n", g[u].index); return false; } if (g[e].minBound != 0 || g[e].maxBound != ROSE_BOUND_INF) { - DEBUG_PRINTF("u=%zu has bounds from root\n", g[u].idx); + DEBUG_PRINTF("u=%zu has bounds from root\n", g[u].index); return false; } for (const auto &oe : out_edges_range(u, g)) { RoseVertex v = target(oe, g); if (g[oe].maxBound != ROSE_BOUND_INF) { - DEBUG_PRINTF("edge (%zu,%zu) has max bound\n", g[u].idx, - g[target(oe, g)].idx); + DEBUG_PRINTF("edge (%zu,%zu) has max bound\n", g[u].index, + g[v].index); return false; } if (g[v].left) { - DEBUG_PRINTF("v=%zu has rose prefix\n", g[v].idx); + DEBUG_PRINTF("v=%zu has rose prefix\n", g[v].index); return false; } } @@ -485,7 +483,7 @@ bool isNoRunsVertex(const RoseBuildImpl &build, RoseVertex u) { static bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, - const rose_literal_info &info) { + const rose_literal_info &info, const size_t max_len) { DEBUG_PRINTF("lit id %u\n", id); if (info.requires_benefits) { @@ -493,6 +491,11 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return false; } + if (build.literals.right.at(id).s.length() > max_len) { + DEBUG_PRINTF("requires literal check\n"); + return false; + } + if (isDirectHighlander(build, id, info)) { DEBUG_PRINTF("highlander direct report\n"); return true; @@ -558,7 +561,7 @@ u64a literalMinReportOffset(const RoseBuildImpl &build, u64a lit_min_offset = UINT64_MAX; for (const auto &v : info.vertices) { - DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].idx, g[v].min_offset); + DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].index, g[v].min_offset); u64a vert_offset = g[v].min_offset; @@ -625,7 +628,7 @@ u64a 
literalMinReportOffset(const RoseBuildImpl &build, vector fillHamsterLiteralList(const RoseBuildImpl &build, rose_literal_table table, - u32 max_offset) { + size_t max_len, u32 max_offset) { vector lits; for (const auto &e : build.literals.right) { @@ -663,10 +666,14 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, const vector &msk = e.second.msk; const vector &cmp = e.second.cmp; - bool noruns = isNoRunsLiteral(build, id, info); + bool noruns = isNoRunsLiteral(build, id, info, max_len); if (info.requires_explode) { DEBUG_PRINTF("exploding lit\n"); + + // We do not require_explode for long literals. + assert(lit.length() <= max_len); + case_iter cit = caseIterateBegin(lit); case_iter cite = caseIterateEnd(); for (; cit != cite; ++cit) { @@ -687,20 +694,28 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, msk, cmp); } } else { - const std::string &s = lit.get_string(); - const bool nocase = lit.any_nocase(); + string s = lit.get_string(); + bool nocase = lit.any_nocase(); DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, " "cmp=%s\n", final_id, escapeString(s).c_str(), (int)nocase, noruns, dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + if (s.length() > max_len) { + DEBUG_PRINTF("truncating to tail of length %zu\n", max_len); + s.erase(0, s.length() - max_len); + // We shouldn't have set a threshold below 8 chars. + assert(msk.size() <= max_len); + } + if (!maskIsConsistent(s, nocase, msk, cmp)) { DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); continue; } - lits.emplace_back(s, nocase, noruns, final_id, groups, msk, cmp); + lits.emplace_back(move(s), nocase, noruns, final_id, groups, msk, + cmp); } } @@ -708,14 +723,15 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, } aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + size_t longLitLengthThreshold, rose_group *fgroups, size_t *fsize, - size_t *historyRequired, - size_t *streamStateRequired) { + size_t *historyRequired) { *fsize = 0; *fgroups = 0; - auto fl = fillHamsterLiteralList(build, ROSE_FLOATING); + auto fl = fillHamsterLiteralList(build, ROSE_FLOATING, + longLitLengthThreshold); if (fl.empty()) { DEBUG_PRINTF("empty floating matcher\n"); return nullptr; @@ -747,13 +763,10 @@ aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, if (build.cc.streaming) { DEBUG_PRINTF("literal_history_required=%zu\n", ctl.literal_history_required); - DEBUG_PRINTF("literal_stream_state_required=%zu\n", - ctl.literal_stream_state_required); assert(ctl.literal_history_required <= build.cc.grey.maxHistoryAvailable); *historyRequired = max(*historyRequired, ctl.literal_history_required); - *streamStateRequired = ctl.literal_stream_state_required; } *fsize = hwlmSize(ftable.get()); @@ -778,8 +791,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto lits = fillHamsterLiteralList(build, ROSE_FLOATING, - ROSE_SMALL_BLOCK_LEN); + auto lits = fillHamsterLiteralList( + build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); if (lits.empty()) { DEBUG_PRINTF("no floating table\n"); return nullptr; @@ -788,8 +801,9 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto anchored_lits = fillHamsterLiteralList(build, - ROSE_ANCHORED_SMALL_BLOCK, ROSE_SMALL_BLOCK_LEN); + auto anchored_lits = + fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, + ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); if (anchored_lits.empty()) { DEBUG_PRINTF("no small-block anchored literals\n"); 
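        /* (Rationale inferred, not stated in the patch: with no anchored
         * small-block literals, a dedicated small-block matcher would add
         * nothing over the regular floating table, so none is built.) */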
return nullptr; @@ -823,7 +837,8 @@ aligned_unique_ptr buildEodAnchoredMatcher(const RoseBuildImpl &build, size_t *esize) { *esize = 0; - auto el = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); + auto el = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED, + build.ematcher_region_size); if (el.empty()) { DEBUG_PRINTF("no eod anchored literals\n"); diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 2a225bf5..a25dbca3 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -51,13 +51,14 @@ struct hwlmLiteral; * only lead to a pattern match after max_offset may be excluded. */ std::vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table, u32 max_offset = ROSE_BOUND_INF); + rose_literal_table table, size_t max_len, + u32 max_offset = ROSE_BOUND_INF); aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + size_t longLitLengthThreshold, rose_group *fgroups, size_t *fsize, - size_t *historyRequired, - size_t *streamStateRequired); + size_t *historyRequired); aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, size_t *sbsize); diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index 759e0dbe..54a7390e 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -53,7 +53,6 @@ #include "nfagraph/ng_redundancy.h" #include "nfagraph/ng_repeat.h" #include "nfagraph/ng_reports.h" -#include "nfagraph/ng_restructuring.h" #include "nfagraph/ng_stop.h" #include "nfagraph/ng_uncalc_components.h" #include "nfagraph/ng_util.h" @@ -207,8 +206,9 @@ void mergeDupeLeaves(RoseBuildImpl &tbi) { continue; } - DEBUG_PRINTF("inspecting vertex idx=%zu in_degree %zu out_degree %zu\n", - g[v].idx, in_degree(v, g), out_degree(v, g)); + DEBUG_PRINTF("inspecting vertex index=%zu in_degree %zu " + "out_degree %zu\n", g[v].index, in_degree(v, g), + out_degree(v, g)); // Vertex must be a reporting leaf node if (g[v].reports.empty() || !isLeafNode(v, g)) { @@ -228,24 +228,22 @@ void mergeDupeLeaves(RoseBuildImpl &tbi) { } RoseVertex t = leaves.find(dupe)->second; - DEBUG_PRINTF("found two leaf dupe roles, idx=%zu,%zu\n", g[v].idx, - g[t].idx); + DEBUG_PRINTF("found two leaf dupe roles, index=%zu,%zu\n", g[v].index, + g[t].index); vector deadEdges; for (const auto &e : in_edges_range(v, g)) { RoseVertex u = source(e, g); - DEBUG_PRINTF("u idx=%zu\n", g[u].idx); - RoseEdge et; - bool exists; - tie (et, exists) = edge(u, t, g); - if (exists) { + DEBUG_PRINTF("u index=%zu\n", g[u].index); + if (RoseEdge et = edge(u, t, g)) { if (g[et].minBound <= g[e].minBound && g[et].maxBound >= g[e].maxBound) { DEBUG_PRINTF("remove more constrained edge\n"); deadEdges.push_back(e); } } else { - DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].idx, g[t].idx); + DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].index, + g[t].index); add_edge(u, t, g[e], g); deadEdges.push_back(e); } @@ -280,7 +278,7 @@ void mergeDupeLeaves(RoseBuildImpl &tbi) { // if we've removed anything, we need to renumber vertices if (countRemovals) { - tbi.renumberVertices(); + renumber_vertices(g); DEBUG_PRINTF("removed %zu vertices.\n", countRemovals); } } @@ -313,8 +311,7 @@ void mergeCluster(RoseGraph &g, const ReportManager &rm, it = it2; DEBUG_PRINTF("merging cluster %zu\n", cluster.size()); - map merged; - mergeNfaCluster(cluster, &rm, merged, cc); + auto merged = mergeNfaCluster(cluster, &rm, cc); DEBUG_PRINTF("done\n"); for (const auto &m : merged) { @@ -351,7 +348,7 @@ void 
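mergeDupeLeaves() above uses the idiom `if (RoseEdge et = edge(u, t, g))`, which relies on the ue2_graph edge descriptor being contextually convertible to bool (null when the edge is absent), replacing the `tie(e, exists)` pair returned by the Boost adjacency_list API. A toy illustration of the idiom, assuming nothing about the real descriptor beyond that conversion:

#include <cstdio>

// Toy edge descriptor: contextually convertible to bool, null when the
// edge does not exist. The real ue2_graph descriptor is more involved.
struct Edge {
    const void *p;
    explicit operator bool() const { return p != nullptr; }
};

Edge lookup_edge(bool exists) {
    static const int dummy = 0;
    return exists ? Edge{&dummy} : Edge{nullptr};
}

int main() {
    if (Edge e = lookup_edge(true)) {
        std::printf("edge found: %p\n", e.p); // declare and test in one step
    }
}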
findUncalcLeavesCandidates(RoseBuildImpl &tbi, // Ref count all suffixes, as we don't want to merge a suffix // that happens to be shared with a non-leaf vertex somewhere. - DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].idx, + DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].index, g[v].suffix.graph.get()); fcount[g[v].suffix.graph.get()]++; @@ -460,7 +457,7 @@ struct RoseGroup { const RoseGraph &g = build.g; assert(in_degree(v, g) == 1); RoseVertex u = *inv_adjacent_vertices(v, g).first; - parent = g[u].idx; + parent = g[u].index; } bool operator<(const RoseGroup &b) const { @@ -581,14 +578,14 @@ bool dedupeLeftfixes(RoseBuildImpl &tbi) { } // Scan the rest of the list for dupes. - for (auto kt = next(jt); kt != jte; ++kt) { + for (auto kt = std::next(jt); kt != jte; ++kt) { if (g[v].left == g[*kt].left || !rosecmp(v, *kt)) { continue; } // Dupe found. DEBUG_PRINTF("rose at vertex %zu is a dupe of %zu\n", - g[*kt].idx, g[v].idx); + g[*kt].index, g[v].index); assert(g[v].left.lag == g[*kt].left.lag); g[*kt].left = g[v].left; work_done = true; @@ -1071,8 +1068,8 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u, return false; } - DEBUG_PRINTF("roses on %zu and %zu are mergeable\n", tbi.g[u].idx, - tbi.g[v].idx); + DEBUG_PRINTF("roses on %zu and %zu are mergeable\n", tbi.g[u].index, + tbi.g[v].index); return true; } @@ -1388,7 +1385,7 @@ void processMergeQueue(RoseBuildImpl &tbi, RoseBouquet &roses, static bool nfaHasNarrowStart(const NGHolder &g) { - if (hasGreaterOutDegree(1, g.startDs, g)) { + if (out_degree(g.startDs, g) > 1) { return false; // unanchored } @@ -1410,7 +1407,7 @@ bool nfaHasFiniteMaxWidth(const NGHolder &g) { namespace { struct RoseMergeKey { - RoseMergeKey(const RoseVertexSet &parents_in, + RoseMergeKey(const set &parents_in, bool narrowStart_in, bool hasMaxWidth_in) : narrowStart(narrowStart_in), hasMaxWidth(hasMaxWidth_in), @@ -1428,7 +1425,7 @@ struct RoseMergeKey { bool narrowStart; bool hasMaxWidth; - RoseVertexSet parents; + set parents; }; } @@ -1457,11 +1454,7 @@ bool hasReformedStartDotStar(const NGHolder &h, const Grey &grey) { static u32 commonPrefixLength(left_id &r1, left_id &r2) { if (r1.graph() && r2.graph()) { - auto &g1 = *r1.graph(); - auto &g2 = *r2.graph(); - auto state_ids_1 = numberStates(g1); - auto state_ids_2 = numberStates(g2); - return commonPrefixLength(g1, state_ids_1, g2, state_ids_2); + return commonPrefixLength(*r1.graph(), *r2.graph()); } else if (r1.castle() && r2.castle()) { return min(findMinWidth(*r1.castle()), findMinWidth(*r2.castle())); } @@ -1496,7 +1489,7 @@ void mergeLeftfixesVariableLag(RoseBuildImpl &tbi) { map rosesByParent; RoseGraph &g = tbi.g; - RoseVertexSet parents(g); + set parents; DEBUG_PRINTF("-----\n"); DEBUG_PRINTF("entry\n"); @@ -1631,7 +1624,7 @@ struct DedupeLeftKey { : left_hash(hashLeftfix(build.g[v].left)) { const auto &g = build.g; for (const auto &e : in_edges_range(v, g)) { - preds.emplace(g[source(e, g)].idx, g[e].rose_top); + preds.emplace(g[source(e, g)].index, g[e].rose_top); } } @@ -1731,7 +1724,7 @@ void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi) { for (auto v : verts1) { DEBUG_PRINTF("replacing report %u with %u on %zu\n", g[v].left.leftfix_report, - v2_left.leftfix_report, g[v].idx); + v2_left.leftfix_report, g[v].index); u32 orig_lag = g[v].left.lag; g[v].left = v2_left; g[v].left.lag = orig_lag; @@ -1750,7 +1743,6 @@ u32 findUnusedTop(const ue2::flat_set &tops) { while (contains(tops, i)) { i++; } - assert(i < NFA_MAX_TOP_MASKS); return i; } @@ -1762,9 +1754,12 @@ void 
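findUnusedTop() above simply scans upward for the first top value not in use; with the per-NFA top ceiling removed elsewhere in this change, the old `assert(i < NFA_MAX_TOP_MASKS)` guard is gone. The same loop, sketched over a plain std::set:

#include <set>

// Sketch of findUnusedTop(): return the smallest top value not already in
// use. No upper bound is asserted now that the top-count ceiling is gone.
unsigned findUnusedTopSketch(const std::set<unsigned> &tops) {
    unsigned i = 0;
    while (tops.count(i)) {
        ++i;
    }
    return i;
}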
replaceTops(NGHolder &h, const map &top_mapping) { if (v == h.startDs) { continue; } - DEBUG_PRINTF("vertex %u has top %u\n", h[v].index, h[e].top); - assert(contains(top_mapping, h[e].top)); - h[e].top = top_mapping.at(h[e].top); + flat_set new_tops; + for (u32 t : h[e].tops) { + DEBUG_PRINTF("vertex %zu has top %u\n", h[v].index, t); + new_tops.insert(top_mapping.at(t)); + } + h[e].tops = move(new_tops); } } @@ -1776,11 +1771,6 @@ bool setDistinctTops(NGHolder &h1, const NGHolder &h2, DEBUG_PRINTF("before: h1 has %zu tops, h2 has %zu tops\n", tops1.size(), tops2.size()); - if (tops1.size() + tops2.size() > NFA_MAX_TOP_MASKS) { - DEBUG_PRINTF("too many tops!\n"); - return false; - } - // If our tops don't intersect, we're OK to merge with no changes. if (!has_intersection(tops1, tops2)) { DEBUG_PRINTF("tops don't intersect\n"); @@ -1814,7 +1804,7 @@ bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2, } for (auto v : verts1) { - DEBUG_PRINTF("vertex %zu\n", g[v].idx); + DEBUG_PRINTF("vertex %zu\n", g[v].index); assert(!g[v].left.haig); assert(!g[v].left.dfa); for (const auto &e : in_edges_range(v, g)) { @@ -1823,7 +1813,7 @@ bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2, assert(contains(top_mapping, t)); g[e].rose_top = top_mapping[t]; DEBUG_PRINTF("edge (%zu,%zu) went from top %u to %u\n", - g[source(e, g)].idx, g[target(e, g)].idx, t, + g[source(e, g)].index, g[target(e, g)].index, t, top_mapping[t]); } } @@ -1844,7 +1834,7 @@ bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2, } for (auto v : verts1) { - DEBUG_PRINTF("vertex %zu\n", g[v].idx); + DEBUG_PRINTF("vertex %zu\n", g[v].index); u32 t = g[v].suffix.top; assert(contains(top_mapping, t)); g[v].suffix.top = top_mapping[t]; @@ -1853,11 +1843,6 @@ bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2, return true; } -static -bool hasMaxTops(const NGHolder &h) { - return getTops(h).size() == NFA_MAX_TOP_MASKS; -} - /** \brief Estimate the number of accel states in the given graph when built as * an NFA. * @@ -1896,11 +1881,6 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) { "with %p (%zu verts)\n", r1.graph(), verts1.size(), r2.graph(), verts2.size()); - if (hasMaxTops(*r1.graph())) { - DEBUG_PRINTF("h1 has hit max tops\n"); - break; // next h1 - } - u32 accel1 = accel_count[r1]; if (accel1 >= NFA_MAX_ACCEL_STATES) { DEBUG_PRINTF("h1 has hit max accel\n"); @@ -2189,17 +2169,17 @@ void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes, suffix_id s1 = *it; const deque &verts1 = suffixes.vertices(s1); assert(s1.graph() && s1.graph()->kind == NFA_SUFFIX); + + // Caller should ensure that we don't propose merges of graphs that are + // already too big. 
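With the NFA_MAX_TOP_MASKS check deleted from setDistinctTops() above, two graphs can always be given disjoint top sets before merging. A rough sketch of one remapping strategy using plain containers; this is an assumption-laden illustration (the real code only builds a mapping when the top sets intersect, and rewrites the per-edge `tops` sets via replaceTops()):

#include <map>
#include <set>

// Sketch: given the tops used by two graphs, build a mapping that shifts
// h2's tops above h1's range so the merged graph's tops stay distinct.
std::map<unsigned, unsigned>
makeDistinctTopMapping(const std::set<unsigned> &tops1,
                       const std::set<unsigned> &tops2) {
    std::map<unsigned, unsigned> mapping;
    unsigned next = tops1.empty() ? 0 : *tops1.rbegin() + 1;
    for (unsigned t : tops2) {
        mapping[t] = next++; // remap each of h2's tops to a fresh value
    }
    return mapping;
}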
+ assert(num_vertices(*s1.graph()) < small_merge_max_vertices(tbi.cc)); + deque merged; for (auto jt = next(it); jt != suffixes.end(); ++jt) { suffix_id s2 = *jt; const deque &verts2 = suffixes.vertices(s2); assert(s2.graph() && s2.graph()->kind == NFA_SUFFIX); - if (hasMaxTops(*s1.graph())) { - DEBUG_PRINTF("h1 has hit max tops\n"); - break; // next h1 - } - if (!acyclic) { u32 accel1 = accel_count[s1]; if (accel1 >= NFA_MAX_ACCEL_STATES) { @@ -2306,6 +2286,10 @@ void mergeAcyclicSuffixes(RoseBuildImpl &tbi) { assert(!g[v].suffix.haig); + if (num_vertices(*h) >= small_merge_max_vertices(tbi.cc)) { + continue; + } + if (!isAcyclic(*h)) { continue; } @@ -2429,7 +2413,8 @@ map chunkedNfaMerge(RoseBuildImpl &build, batch.push_back(*it); assert((*it)->kind == NFA_OUTFIX); if (batch.size() == MERGE_GROUP_SIZE_MAX || next(it) == ite) { - mergeNfaCluster(batch, &build.rm, merged, build.cc); + auto batch_merged = mergeNfaCluster(batch, &build.rm, build.cc); + insert(&merged, batch_merged); batch.clear(); } } diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index c2f9f580..28b885bd 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -75,7 +75,6 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, : cc(cc_in), root(add_vertex(g)), anchored_root(add_vertex(g)), - vertexIndex(0), delay_base_id(MO_INVALID_IDX), hasSom(false), group_end(0), @@ -89,11 +88,9 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, boundary(boundary_in), next_nfa_report(0) { // add root vertices to graph - g[root].idx = vertexIndex++; g[root].min_offset = 0; g[root].max_offset = 0; - g[anchored_root].idx = vertexIndex++; g[anchored_root].min_offset = 0; g[anchored_root].max_offset = 0; } @@ -193,7 +190,7 @@ bool RoseBuildImpl::hasLiteralInTable(RoseVertex v, bool RoseBuildImpl::hasNoFloatingRoots() const { for (auto v : adjacent_vertices_range(root, g)) { if (isFloating(v)) { - DEBUG_PRINTF("direct floating root %zu\n", g[v].idx); + DEBUG_PRINTF("direct floating root %zu\n", g[v].index); return false; } } @@ -201,7 +198,7 @@ bool RoseBuildImpl::hasNoFloatingRoots() const { /* need to check if the anchored_root has any literals which are too deep */ for (auto v : adjacent_vertices_range(anchored_root, g)) { if (isFloating(v)) { - DEBUG_PRINTF("indirect floating root %zu\n", g[v].idx); + DEBUG_PRINTF("indirect floating root %zu\n", g[v].index); return false; } } @@ -337,14 +334,14 @@ size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const { void RoseBuildImpl::removeVertices(const vector &dead) { for (auto v : dead) { assert(!isAnyStart(v)); - DEBUG_PRINTF("removing vertex %zu\n", g[v].idx); + DEBUG_PRINTF("removing vertex %zu\n", g[v].index); for (auto lit_id : g[v].literals) { literal_info[lit_id].vertices.erase(v); } - clear_vertex_faster(v, g); + clear_vertex(v, g); remove_vertex(v, g); } - renumberVertices(); + renumber_vertices(g); } // Find the maximum bound on the edges to this vertex's successors ignoring @@ -893,7 +890,6 @@ bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { // Note: only clones the vertex, you'll have to wire up your own edges. 
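chunkedNfaMerge() above feeds candidates to mergeNfaCluster() in batches of MERGE_GROUP_SIZE_MAX, accumulating the merged results as it goes. The batching skeleton, written generically (mergeInChunks and kGroupMax are illustrative names):

#include <cstddef>
#include <vector>

// Sketch of the batching pattern in chunkedNfaMerge: process candidates
// in groups of at most kGroupMax so each merge pass stays tractable.
template <typename T, typename MergeFn>
void mergeInChunks(const std::vector<T> &items, std::size_t kGroupMax,
                   MergeFn merge) {
    std::vector<T> batch;
    for (std::size_t i = 0; i < items.size(); ++i) {
        batch.push_back(items[i]);
        if (batch.size() == kGroupMax || i + 1 == items.size()) {
            merge(batch); // merge this chunk, then start a fresh one
            batch.clear();
        }
    }
}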
RoseVertex RoseBuildImpl::cloneVertex(RoseVertex v) { RoseVertex v2 = add_vertex(g[v], g); - g[v2].idx = vertexIndex++; for (const auto &lit_id : g[v2].literals) { literal_info[lit_id].vertices.insert(v2); @@ -903,12 +899,15 @@ RoseVertex RoseBuildImpl::cloneVertex(RoseVertex v) { } #ifndef NDEBUG -bool roseHasTops(const RoseGraph &g, RoseVertex v) { +bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { + const RoseGraph &g = build.g; assert(g[v].left); set graph_tops; - for (const auto &e : in_edges_range(v, g)) { - graph_tops.insert(g[e].rose_top); + if (!build.isRootSuccessor(v)) { + for (const auto &e : in_edges_range(v, g)) { + graph_tops.insert(g[e].rose_top); + } } return is_subset_of(graph_tops, all_tops(g[v].left)); @@ -1073,18 +1072,9 @@ bool has_non_eod_accepts(const suffix_id &s) { set all_tops(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { - set tops; - const NGHolder &h = *s.graph(); - for (const auto &e : out_edges_range(h.start, h)) { - if (target(e, h) == h.startDs) { - continue; - } - tops.insert(h[e].top); - } - if (tops.empty()) { - tops.insert(0); // Vacuous graph, triggered on zero top. - } - return tops; + flat_set tops = getTops(*s.graph()); + assert(!tops.empty()); + return {tops.begin(), tops.end()}; } if (s.castle()) { @@ -1142,18 +1132,8 @@ depth findMaxWidth(const left_id &r) { set all_tops(const left_id &r) { assert(r.graph() || r.castle() || r.haig() || r.dfa()); if (r.graph()) { - set tops; - const NGHolder &h = *r.graph(); - for (const auto &e : out_edges_range(h.start, h)) { - if (target(e, h) == h.startDs) { - continue; - } - tops.insert(h[e].top); - } - if (tops.empty()) { - tops.insert(0); // Vacuous graph, triggered on zero top. - } - return tops; + flat_set tops = getTops(*r.graph()); + return {tops.begin(), tops.end()}; } if (r.castle()) { @@ -1226,7 +1206,7 @@ u32 roseQuality(const RoseEngine *t) { } const NFA *nfa = (const NFA *)((const char *)atable + sizeof(*atable)); - if (nfa->type != MCCLELLAN_NFA_8) { + if (!isSmallDfaType(nfa->type)) { DEBUG_PRINTF("m16 atable engine\n"); return 0; } @@ -1293,7 +1273,7 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { // First, check the Rose leftfixes. for (auto v : vertices_range(g)) { - DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].idx); + DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].index); if (g[v].left.castle) { DEBUG_PRINTF("castle ok\n"); @@ -1309,10 +1289,10 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { } if (g[v].left.graph) { assert(g[v].left.graph->kind - == tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX); + == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX)); if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) { - DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n", g[v].idx, - num_vertices(*g[v].left.graph)); + DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n", + g[v].index, num_vertices(*g[v].left.graph)); return false; } } @@ -1321,7 +1301,7 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { // Suffix graphs. 
for (auto v : vertices_range(g)) { - DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].idx); + DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].index); const RoseSuffixInfo &suffix = g[v].suffix; if (suffix.castle) { @@ -1339,8 +1319,8 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { if (suffix.graph) { assert(suffix.graph->kind == NFA_SUFFIX); if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) { - DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n", g[v].idx, - num_vertices(*suffix.graph)); + DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n", + g[v].index, num_vertices(*suffix.graph)); return false; } } @@ -1348,6 +1328,49 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { return true; } + +bool hasOrphanedTops(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + + ue2::unordered_map > roses; + ue2::unordered_map > suffixes; + + for (auto v : vertices_range(g)) { + if (g[v].left) { + set &tops = roses[g[v].left]; + if (!build.isRootSuccessor(v)) { + // Tops for infixes come from the in-edges. + for (const auto &e : in_edges_range(v, g)) { + tops.insert(g[e].rose_top); + } + } + } + if (g[v].suffix) { + suffixes[g[v].suffix].insert(g[v].suffix.top); + } + } + + for (const auto &e : roses) { + if (all_tops(e.first) != e.second) { + DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n", + as_string_list(all_tops(e.first)).c_str(), + as_string_list(e.second).c_str()); + return true; + } + } + + for (const auto &e : suffixes) { + if (all_tops(e.first) != e.second) { + DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n", + as_string_list(all_tops(e.first)).c_str(), + as_string_list(e.second).c_str()); + return true; + } + } + + return false; +} + #endif // NDEBUG } // namespace ue2 diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp new file mode 100644 index 00000000..ee237639 --- /dev/null +++ b/src/rose/rose_build_program.cpp @@ -0,0 +1,572 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
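hasOrphanedTops() above is a debug-build consistency check: the tops recorded on the Rose graph for each engine must exactly match that engine's declared all_tops() set. A simplified sketch of the comparison, with Engine standing in for the left_id/suffix_id keys:

#include <map>
#include <set>

// Sketch of the hasOrphanedTops() check: gather the tops actually used by
// the graph per engine, then flag any engine whose declared top set
// differs. Engine is a hypothetical stand-in for the real key types.
template <typename Engine>
bool topsAreOrphaned(const std::map<Engine, std::set<unsigned>> &used,
                     const std::map<Engine, std::set<unsigned>> &declared) {
    for (const auto &e : used) {
        auto it = declared.find(e.first);
        if (it == declared.end() || it->second != e.second) {
            return true; // mismatch: some top is orphaned or missing
        }
    }
    return false;
}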
+ */ + +#include "rose_build_engine_blob.h" +#include "rose_build_program.h" +#include "util/container.h" +#include "util/multibit_build.h" +#include "util/verify_types.h" + +#include +#include + +using namespace std; + +namespace ue2 { + +/* Destructors to avoid weak vtables. */ + +RoseInstruction::~RoseInstruction() = default; +RoseInstrCatchUp::~RoseInstrCatchUp() = default; +RoseInstrCatchUpMpv::~RoseInstrCatchUpMpv() = default; +RoseInstrSomZero::~RoseInstrSomZero() = default; +RoseInstrSuffixesEod::~RoseInstrSuffixesEod() = default; +RoseInstrMatcherEod::~RoseInstrMatcherEod() = default; +RoseInstrEnd::~RoseInstrEnd() = default; + +using OffsetMap = RoseInstruction::OffsetMap; + +static +u32 calc_jump(const OffsetMap &offset_map, const RoseInstruction *from, + const RoseInstruction *to) { + DEBUG_PRINTF("computing relative jump from %p to %p\n", from, to); + assert(from && contains(offset_map, from)); + assert(to && contains(offset_map, to)); + + u32 from_offset = offset_map.at(from); + u32 to_offset = offset_map.at(to); + DEBUG_PRINTF("offsets: %u -> %u\n", from_offset, to_offset); + assert(from_offset <= to_offset); + + return to_offset - from_offset; +} + +void RoseInstrAnchoredDelay::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; + inst->done_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckLitEarly::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->min_offset = min_offset; +} + +void RoseInstrCheckGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrCheckOnlyEod::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckBounds::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->min_bound = min_bound; + inst->max_bound = max_bound; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckNotHandled::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->key = key; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckSingleLookaround::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->offset = offset; + inst->reach_index = reach_index; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckLookaround::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->index = index; + inst->count = count; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMask::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = 
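calc_jump() above encodes control transfers as byte distances: every instruction's offset is recorded in an OffsetMap during layout, and a jump is simply the difference between the target's and the source's offsets. A standalone sketch (calcJump is an illustrative name):

#include <cassert>
#include <cstdint>
#include <unordered_map>

// Sketch of the calc_jump scheme: instructions are laid out first, then
// each jump becomes the byte distance between the recorded offsets of the
// source and target. Only forward jumps are expected, hence the assert.
using OffsetMap = std::unordered_map<const void *, std::uint32_t>;

std::uint32_t calcJump(const OffsetMap &offsets, const void *from,
                       const void *to) {
    std::uint32_t from_off = offsets.at(from);
    std::uint32_t to_off = offsets.at(to);
    assert(from_off <= to_off);
    return to_off - from_off;
}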
static_cast(dest); + inst->and_mask = and_mask; + inst->cmp_mask = cmp_mask; + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(and_mask), end(and_mask), inst->and_mask); + copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->and_mask = and_mask; + inst->cmp_mask = cmp_mask; + inst->negation = negation; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti16x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(nib_mask), end(nib_mask), inst->nib_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti16x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->lag = lag; + inst->report = report; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckPrefix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + 
inst->queue = queue; + inst->lag = lag; + inst->report = report; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrPushDelayed::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->delay = delay; + inst->index = index; +} + +void RoseInstrRecordAnchored::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->id = id; +} + +void RoseInstrSomAdjust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->distance = distance; +} + +void RoseInstrSomLeftfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->lag = lag; +} + +void RoseInstrSomFromReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrTriggerInfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->cancel = cancel; + inst->queue = queue; + inst->event = event; +} + +void RoseInstrTriggerSuffix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->event = event; +} + +void RoseInstrDedupe::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrDedupeSom::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrReportChain::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->event = event; + inst->top_squash_distance = top_squash_distance; +} + +void RoseInstrReportSomInt::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrReportSomAware::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrReportExhaust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto 
*inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->ekey = ekey; +} + +void RoseInstrReportSom::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrReportSomExhaust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->ekey = ekey; +} + +void RoseInstrDedupeAndReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrFinalReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrCheckExhausted::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->ekey = ekey; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMinLength::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->end_adj = end_adj; + inst->min_length = min_length; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrSetState::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->index = index; +} + +void RoseInstrSetGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrSquashGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrCheckState::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->index = index; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrSparseIterBegin::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); + + // Resolve and write the multibit sparse iterator and the jump table. 
+ vector keys; + vector jump_offsets; + for (const auto &jump : jump_table) { + keys.push_back(jump.first); + assert(contains(offset_map, jump.second)); + jump_offsets.push_back(offset_map.at(jump.second)); + } + + vector iter; + mmbBuildSparseIterator(iter, keys, num_keys); + assert(!iter.empty()); + inst->iter_offset = blob.add_iterator(iter); + inst->jump_table = blob.add(jump_offsets.begin(), jump_offsets.end()); + + // Store offsets for corresponding SPARSE_ITER_NEXT operations. + is_written = true; + iter_offset = inst->iter_offset; + jump_table_offset = inst->jump_table; +} + +void RoseInstrSparseIterNext::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->state = state; + inst->fail_jump = calc_jump(offset_map, this, target); + + // Use the same sparse iterator and jump table as the SPARSE_ITER_BEGIN + // instruction. + assert(begin); + assert(contains(offset_map, begin)); + assert(begin->is_written); + inst->iter_offset = begin->iter_offset; + inst->jump_table = begin->jump_table_offset; +} + +void RoseInstrSparseIterAny::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); + + // Write the multibit sparse iterator. + vector iter; + mmbBuildSparseIterator(iter, keys, num_keys); + assert(!iter.empty()); + inst->iter_offset = blob.add_iterator(iter); +} + +void RoseInstrEnginesEod::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->iter_offset = iter_offset; +} + +void RoseInstrCheckLongLit::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); +} + +void RoseInstrCheckLongLitNocase::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); +} + +static +OffsetMap makeOffsetMap(const RoseProgram &program, u32 *total_len) { + OffsetMap offset_map; + u32 offset = 0; + for (const auto &ri : program) { + offset = ROUNDUP_N(offset, ROSE_INSTR_MIN_ALIGN); + DEBUG_PRINTF("instr %p (opcode %d) -> offset %u\n", ri.get(), + ri->code(), offset); + assert(!contains(offset_map, ri.get())); + offset_map.emplace(ri.get(), offset); + offset += ri->byte_length(); + } + *total_len = offset; + return offset_map; +} + +aligned_unique_ptr +writeProgram(RoseEngineBlob &blob, const RoseProgram &program, u32 *total_len) { + const auto offset_map = makeOffsetMap(program, total_len); + DEBUG_PRINTF("%zu instructions, len %u\n", program.size(), *total_len); + + auto bytecode = aligned_zmalloc_unique(*total_len); + char *ptr = bytecode.get(); + + for (const auto &ri : program) { + assert(contains(offset_map, ri.get())); + const u32 offset = offset_map.at(ri.get()); + ri->write(ptr + offset, blob, offset_map); + } + + return bytecode; +} + +bool RoseProgramEquivalence::operator()(const RoseProgram &prog1, + const RoseProgram 
&prog2) const { + if (prog1.size() != prog2.size()) { + return false; + } + + u32 len_1 = 0, len_2 = 0; + const auto offset_map_1 = makeOffsetMap(prog1, &len_1); + const auto offset_map_2 = makeOffsetMap(prog2, &len_2); + + if (len_1 != len_2) { + return false; + } + + auto is_equiv = [&](const unique_ptr &a, + const unique_ptr &b) { + assert(a && b); + return a->equiv(*b, offset_map_1, offset_map_2); + }; + + return std::equal(prog1.begin(), prog1.end(), prog2.begin(), is_equiv); +} + +} // namespace ue2 diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h new file mode 100644 index 00000000..0c725b46 --- /dev/null +++ b/src/rose/rose_build_program.h @@ -0,0 +1,1967 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_BUILD_PROGRAM_H +#define ROSE_BUILD_PROGRAM_H + +#include "rose_build_impl.h" +#include "rose_program.h" +#include "som/som_operation.h" +#include "util/alloc.h" +#include "util/container.h" +#include "util/hash.h" +#include "util/make_unique.h" +#include "util/ue2_containers.h" +#include "util/ue2string.h" + +#include +#include +#include + +#include +#include + +namespace ue2 { + +class RoseEngineBlob; + +/** + * \brief Abstract base class representing a single Rose instruction. + */ +class RoseInstruction { +public: + virtual ~RoseInstruction(); + + /** \brief Opcode used for the instruction in the bytecode. */ + virtual RoseInstructionCode code() const = 0; + + /** + * \brief Simple hash used for program equivalence. + * + * Note that pointers (jumps, for example) should not be used when + * calculating the hash: they will be converted to instruction offsets when + * compared later. + */ + virtual size_t hash() const = 0; + + /** \brief Length of the bytecode instruction in bytes. */ + virtual size_t byte_length() const = 0; + + using OffsetMap = unordered_map; + + /** + * \brief Writes a concrete implementation of this instruction. 
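makeOffsetMap() and writeProgram() above implement a two-pass serializer: the first pass assigns each instruction a byte offset rounded up to ROSE_INSTR_MIN_ALIGN, and the second writes each instruction at its recorded position, resolving jumps through the map. A sketch of the layout pass under those assumptions (Instr, layoutProgram and kAlign are illustrative names):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Sketch of the layout pass: assign each instruction an aligned offset
// and record the total encoded length. The write pass (not shown) then
// emits each instruction at its recorded offset.
struct Instr {
    std::uint32_t len; // byte_length() of the encoded instruction
};

std::unordered_map<const Instr *, std::uint32_t>
layoutProgram(const std::vector<Instr> &prog, std::uint32_t kAlign,
              std::uint32_t *total_len) {
    std::unordered_map<const Instr *, std::uint32_t> offsets;
    std::uint32_t off = 0;
    for (const Instr &ri : prog) {
        off = (off + kAlign - 1) / kAlign * kAlign; // round up to alignment
        offsets.emplace(&ri, off);
        off += ri.len;
    }
    *total_len = off;
    return offsets;
}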
+ * + * Other data that this instruction depends on is written directly into the + * blob, while the instruction structure itself (of size given by + * the byte_length() function) is written to dest. + */ + virtual void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const = 0; + + /** + * \brief Update a target pointer. + * + * If this instruction contains any reference to the old target, replace it + * with the new one. + */ + virtual void update_target(const RoseInstruction *old_target, + const RoseInstruction *new_target) = 0; + + /** + * \brief True if these instructions are equivalent within their own + * programs. + * + * Checks that any pointers to other instructions point to the same + * offsets. + */ + bool equiv(const RoseInstruction &other, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return equiv_impl(other, offsets, other_offsets); + } + +private: + virtual bool equiv_impl(const RoseInstruction &other, + const OffsetMap &offsets, + const OffsetMap &other_offsets) const = 0; +}; + +/** + * \brief Templated implementation class to handle boring boilerplate code. + */ +template +class RoseInstrBase : public RoseInstruction { +protected: + static constexpr RoseInstructionCode opcode = Opcode; + using impl_type = ImplType; + +public: + RoseInstructionCode code() const override { return opcode; } + + size_t byte_length() const override { + return sizeof(impl_type); + } + + /** + * Note: this implementation simply zeroes the destination region and + * writes in the correct opcode. This is sufficient for trivial + * instructions, but instructions with data members will want to override + * it. + */ + void write(void *dest, RoseEngineBlob &, + const RoseInstruction::OffsetMap &) const override { + assert(dest != nullptr); + assert(ISALIGNED_N(dest, ROSE_INSTR_MIN_ALIGN)); + + impl_type *inst = static_cast(dest); + memset(inst, 0, sizeof(impl_type)); + inst->code = verify_u8(opcode); + } + +private: + bool equiv_impl(const RoseInstruction &other, const OffsetMap &offsets, + const OffsetMap &other_offsets) const override { + const auto *ri_that = dynamic_cast(&other); + if (!ri_that) { + return false; + } + const auto *ri_this = dynamic_cast(this); + assert(ri_this); + return ri_this->equiv_to(*ri_that, offsets, other_offsets); + } +}; + +/** + * \brief Refinement of RoseInstrBase to use for instructions that have + * just a single target member, called "target". + */ +template +class RoseInstrBaseOneTarget + : public RoseInstrBase { +public: + void update_target(const RoseInstruction *old_target, + const RoseInstruction *new_target) override { + RoseInstrType *ri = dynamic_cast(this); + assert(ri); + if (ri->target == old_target) { + ri->target = new_target; + } + } +}; + +/** + * \brief Refinement of RoseInstrBase to use for instructions that have no + * targets. + */ +template +class RoseInstrBaseNoTargets + : public RoseInstrBase { +public: + void update_target(const RoseInstruction *, + const RoseInstruction *) override {} +}; + +/** + * \brief Refinement of RoseInstrBaseNoTargets to use for instructions that + * have no members at all, just an opcode. 
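RoseInstrBase above centralises the boilerplate: equiv() dispatches through a private virtual equiv_impl() that uses dynamic_cast to recover the concrete type, so instructions of different types never compare equivalent and each concrete class only supplies a plain equiv_to(). A minimal reproduction of that dispatch pattern, with toy names:

#include <cassert>

// Sketch of the equiv_impl() dispatch: the base class owns the virtual
// entry point; a templated middle layer recovers the concrete type with
// dynamic_cast and defers to a non-virtual equiv_to() on matching types.
struct Base {
    virtual ~Base() = default;
    bool equiv(const Base &other) const { return equiv_impl(other); }
private:
    virtual bool equiv_impl(const Base &other) const = 0;
};

template <typename Derived>
struct BaseImpl : Base {
private:
    bool equiv_impl(const Base &other) const override {
        const auto *that = dynamic_cast<const Derived *>(&other);
        if (!that) {
            return false; // different instruction types: never equivalent
        }
        return static_cast<const Derived *>(this)->equiv_to(*that);
    }
};

struct Nop : BaseImpl<Nop> {
    bool equiv_to(const Nop &) const { return true; }
};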
+ */ +template +class RoseInstrBaseTrivial + : public RoseInstrBaseNoTargets { +public: + virtual bool operator==(const RoseInstrType &) const { return true; } + + size_t hash() const override { + return boost::hash_value(static_cast(Opcode)); + } + + bool equiv_to(const RoseInstrType &, const RoseInstruction::OffsetMap &, + const RoseInstruction::OffsetMap &) const { + return true; + } +}; + +//// +//// Concrete implementation classes start here. +//// + +class RoseInstrAnchoredDelay + : public RoseInstrBaseOneTarget { +public: + rose_group groups; + const RoseInstruction *target; + + RoseInstrAnchoredDelay(rose_group groups_in, + const RoseInstruction *target_in) + : groups(groups_in), target(target_in) {} + + bool operator==(const RoseInstrAnchoredDelay &ri) const { + return groups == ri.groups && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), groups); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrAnchoredDelay &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return groups == ri.groups && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckLitEarly + : public RoseInstrBaseNoTargets { +public: + u32 min_offset; + + explicit RoseInstrCheckLitEarly(u32 min) : min_offset(min) {} + + bool operator==(const RoseInstrCheckLitEarly &ri) const { + return min_offset == ri.min_offset; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), min_offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckLitEarly &ri, const OffsetMap &, + const OffsetMap &) const { + return min_offset == ri.min_offset; + } +}; + +class RoseInstrCheckGroups + : public RoseInstrBaseNoTargets { +public: + rose_group groups; + + explicit RoseInstrCheckGroups(rose_group groups_in) : groups(groups_in) {} + + bool operator==(const RoseInstrCheckGroups &ri) const { + return groups == ri.groups; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), groups); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckGroups &ri, const OffsetMap &, + const OffsetMap &) const { + return groups == ri.groups; + } +}; + +class RoseInstrCheckOnlyEod + : public RoseInstrBaseOneTarget { +public: + const RoseInstruction *target; + + explicit RoseInstrCheckOnlyEod(const RoseInstruction *target_in) + : target(target_in) {} + + bool operator==(const RoseInstrCheckOnlyEod &ri) const { + return target == ri.target; + } + + size_t hash() const override { + return boost::hash_value(static_cast(opcode)); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckOnlyEod &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckBounds + : public RoseInstrBaseOneTarget { +public: + u64a min_bound; + u64a max_bound; + const RoseInstruction *target; + + RoseInstrCheckBounds(u64a min, u64a max, const RoseInstruction *target_in) + : min_bound(min), max_bound(max), target(target_in) {} + + bool operator==(const RoseInstrCheckBounds &ri) const { + return min_bound == ri.min_bound && max_bound == ri.max_bound && + target == ri.target; + } + + size_t hash() const override 
{ + return hash_all(static_cast(opcode), min_bound, max_bound); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckBounds &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return min_bound == ri.min_bound && max_bound == ri.max_bound && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckNotHandled + : public RoseInstrBaseOneTarget { +public: + u32 key; + const RoseInstruction *target; + + RoseInstrCheckNotHandled(u32 key_in, const RoseInstruction *target_in) + : key(key_in), target(target_in) {} + + bool operator==(const RoseInstrCheckNotHandled &ri) const { + return key == ri.key && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), key); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckNotHandled &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return key == ri.key && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckSingleLookaround + : public RoseInstrBaseOneTarget { +public: + s8 offset; + u32 reach_index; + const RoseInstruction *target; + + RoseInstrCheckSingleLookaround(s8 offset_in, u32 reach_index_in, + const RoseInstruction *target_in) + : offset(offset_in), reach_index(reach_index_in), target(target_in) {} + + bool operator==(const RoseInstrCheckSingleLookaround &ri) const { + return offset == ri.offset && reach_index == ri.reach_index && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), offset, reach_index); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckSingleLookaround &ri, + const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return offset == ri.offset && reach_index == ri.reach_index && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckLookaround + : public RoseInstrBaseOneTarget { +public: + u32 index; + u32 count; + const RoseInstruction *target; + + RoseInstrCheckLookaround(u32 index_in, u32 count_in, + const RoseInstruction *target_in) + : index(index_in), count(count_in), target(target_in) {} + + bool operator==(const RoseInstrCheckLookaround &ri) const { + return index == ri.index && count == ri.count && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), index, count); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckLookaround &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return index == ri.index && count == ri.count && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckMask + : public RoseInstrBaseOneTarget { +public: + u64a and_mask; + u64a cmp_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask(u64a and_mask_in, u64a cmp_mask_in, u64a neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(and_mask_in), cmp_mask(cmp_mask_in), neg_mask(neg_mask_in), + offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckMask &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; 
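Note how every equiv_to() above compares jump targets through the two programs' offset maps rather than by pointer: instructions from different programs are equivalent only if their targets land at the same byte offset in each program. Distilled to its core (sameTarget is an illustrative name):

#include <cstdint>
#include <unordered_map>

// Sketch of the target comparison in the equiv_to() methods: raw target
// pointers are program-local, so they are translated to byte offsets via
// each program's own offset map before comparing.
struct Instr;
using OffsetMap = std::unordered_map<const Instr *, std::uint32_t>;

bool sameTarget(const OffsetMap &a, const Instr *ta,
                const OffsetMap &b, const Instr *tb) {
    return a.at(ta) == b.at(tb);
}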
+ } + + size_t hash() const override { + return hash_all(static_cast(opcode), and_mask, cmp_mask, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckMask &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckMask32 + : public RoseInstrBaseOneTarget { +public: + std::array and_mask; + std::array cmp_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask32(std::array and_mask_in, + std::array cmp_mask_in, u32 neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(move(and_mask_in)), cmp_mask(move(cmp_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckMask32 &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), and_mask, cmp_mask, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckMask32 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckByte + : public RoseInstrBaseOneTarget { +public: + u8 and_mask; + u8 cmp_mask; + u8 negation; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckByte(u8 and_mask_in, u8 cmp_mask_in, u8 negation_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(and_mask_in), cmp_mask(cmp_mask_in), negation(negation_in), + offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckByte &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + negation == ri.negation && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), and_mask, cmp_mask, negation, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckByte &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + negation == ri.negation && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti16x8 + : public RoseInstrBaseOneTarget { +public: + std::array nib_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti16x8(std::array nib_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : nib_mask(move(nib_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti16x8 &ri) const { + return nib_mask == ri.nib_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { 
+ return hash_all(static_cast(opcode), nib_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti16x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return nib_mask == ri.nib_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti32x8 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti32x8(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti32x8 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), hi_mask, lo_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti32x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti16x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti16x16(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti16x16 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), hi_mask, lo_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti16x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti32x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask_hi; + std::array bucket_select_mask_lo; + u32 neg_mask; + s32 offset; + const 
+
+class RoseInstrCheckShufti16x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_16x16,
+                                    ROSE_STRUCT_CHECK_SHUFTI_16x16,
+                                    RoseInstrCheckShufti16x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 32> bucket_select_mask;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti16x16(std::array<u8, 32> hi_mask_in,
+                              std::array<u8, 32> lo_mask_in,
+                              std::array<u8, 32> bucket_select_mask_in,
+                              u32 neg_mask_in, s32 offset_in,
+                              const RoseInstruction *target_in)
+        : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)),
+          bucket_select_mask(move(bucket_select_mask_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti16x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti16x16 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckShufti32x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_32x16,
+                                    ROSE_STRUCT_CHECK_SHUFTI_32x16,
+                                    RoseInstrCheckShufti32x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 32> bucket_select_mask_hi;
+    std::array<u8, 32> bucket_select_mask_lo;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti32x16(std::array<u8, 32> hi_mask_in,
+                              std::array<u8, 32> lo_mask_in,
+                              std::array<u8, 32> bucket_select_mask_hi_in,
+                              std::array<u8, 32> bucket_select_mask_lo_in,
+                              u32 neg_mask_in, s32 offset_in,
+                              const RoseInstruction *target_in)
+        : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)),
+          bucket_select_mask_hi(move(bucket_select_mask_hi_in)),
+          bucket_select_mask_lo(move(bucket_select_mask_lo_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti32x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask_hi, bucket_select_mask_lo,
+                        neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti32x16 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
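Every instruction class here implements the same pair of comparisons: operator== is an exact match that compares jump targets by raw pointer, while equiv_to() compares targets through the supplied OffsetMaps, so that two programs built from distinct objects are recognised as equivalent when their instructions serialise to the same offsets. The intent, as a free-standing sketch (helper name invented):

    // Two targets are equivalent if they land at the same bytecode offset in
    // their respective programs, whatever their builder-object addresses are.
    static bool targets_equiv(const OffsetMap &offsets,
                              const RoseInstruction *t,
                              const OffsetMap &other_offsets,
                              const RoseInstruction *other_t) {
        return offsets.at(t) == other_offsets.at(other_t);
    }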
+
+class RoseInstrCheckInfix
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_INFIX,
+                                    ROSE_STRUCT_CHECK_INFIX,
+                                    RoseInstrCheckInfix> {
+public:
+    u32 queue;
+    u32 lag;
+    ReportID report;
+    const RoseInstruction *target;
+
+    RoseInstrCheckInfix(u32 queue_in, u32 lag_in, ReportID report_in,
+                        const RoseInstruction *target_in)
+        : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckInfix &ri) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), queue, lag, report);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckInfix &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckPrefix
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_PREFIX,
+                                    ROSE_STRUCT_CHECK_PREFIX,
+                                    RoseInstrCheckPrefix> {
+public:
+    u32 queue;
+    u32 lag;
+    ReportID report;
+    const RoseInstruction *target;
+
+    RoseInstrCheckPrefix(u32 queue_in, u32 lag_in, ReportID report_in,
+                         const RoseInstruction *target_in)
+        : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckPrefix &ri) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), queue, lag, report);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckPrefix &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
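CHECK_INFIX and CHECK_PREFIX carry identical payloads: at run time they ask whether the leftfix engine on `queue` could have matched `report` ending `lag` bytes before the current position, branching to `target` (the serialised fail_jump) otherwise. Roughly, in invented pseudo-code rather than the runtime's actual API:

    // u64a end = offset of the current literal match;
    if (!leftfix_accepts_at(queue, report, end - lag)) {
        pc = target; // fail_jump
    }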
+
+class RoseInstrPushDelayed
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_PUSH_DELAYED,
+                                    ROSE_STRUCT_PUSH_DELAYED,
+                                    RoseInstrPushDelayed> {
+public:
+    u8 delay;
+    u32 index;
+
+    RoseInstrPushDelayed(u8 delay_in, u32 index_in)
+        : delay(delay_in), index(index_in) {}
+
+    bool operator==(const RoseInstrPushDelayed &ri) const {
+        return delay == ri.delay && index == ri.index;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), delay, index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrPushDelayed &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return delay == ri.delay && index == ri.index;
+    }
+};
+
+class RoseInstrRecordAnchored
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_RECORD_ANCHORED,
+                                    ROSE_STRUCT_RECORD_ANCHORED,
+                                    RoseInstrRecordAnchored> {
+public:
+    u32 id;
+
+    explicit RoseInstrRecordAnchored(u32 id_in) : id(id_in) {}
+
+    bool operator==(const RoseInstrRecordAnchored &ri) const {
+        return id == ri.id;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), id);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrRecordAnchored &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return id == ri.id;
+    }
+};
+
+class RoseInstrCatchUp
+    : public RoseInstrBaseTrivial<ROSE_INSTR_CATCH_UP, ROSE_STRUCT_CATCH_UP,
+                                  RoseInstrCatchUp> {
+public:
+    ~RoseInstrCatchUp() override;
+};
+
+class RoseInstrCatchUpMpv
+    : public RoseInstrBaseTrivial<ROSE_INSTR_CATCH_UP_MPV,
+                                  ROSE_STRUCT_CATCH_UP_MPV,
+                                  RoseInstrCatchUpMpv> {
+public:
+    ~RoseInstrCatchUpMpv() override;
+};
+
+class RoseInstrSomAdjust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_ADJUST,
+                                    ROSE_STRUCT_SOM_ADJUST,
+                                    RoseInstrSomAdjust> {
+public:
+    u32 distance;
+
+    explicit RoseInstrSomAdjust(u32 distance_in) : distance(distance_in) {}
+
+    bool operator==(const RoseInstrSomAdjust &ri) const {
+        return distance == ri.distance;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), distance);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomAdjust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return distance == ri.distance;
+    }
+};
+
+class RoseInstrSomLeftfix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_LEFTFIX,
+                                    ROSE_STRUCT_SOM_LEFTFIX,
+                                    RoseInstrSomLeftfix> {
+public:
+    u32 queue;
+    u32 lag;
+
+    RoseInstrSomLeftfix(u32 queue_in, u32 lag_in)
+        : queue(queue_in), lag(lag_in) {}
+
+    bool operator==(const RoseInstrSomLeftfix &ri) const {
+        return queue == ri.queue && lag == ri.lag;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), queue, lag);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomLeftfix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return queue == ri.queue && lag == ri.lag;
+    }
+};
+
+class RoseInstrSomFromReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_FROM_REPORT,
+                                    ROSE_STRUCT_SOM_FROM_REPORT,
+                                    RoseInstrSomFromReport> {
+public:
+    som_operation som;
+
+    RoseInstrSomFromReport() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrSomFromReport &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomFromReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrSomZero
+    : public RoseInstrBaseTrivial<ROSE_INSTR_SOM_ZERO, ROSE_STRUCT_SOM_ZERO,
+                                  RoseInstrSomZero> {
+public:
+    ~RoseInstrSomZero() override;
+};
+
+class RoseInstrTriggerInfix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_TRIGGER_INFIX,
+                                    ROSE_STRUCT_TRIGGER_INFIX,
+                                    RoseInstrTriggerInfix> {
+public:
+    u8 cancel;
+    u32 queue;
+    u32 event;
+
+    RoseInstrTriggerInfix(u8 cancel_in, u32 queue_in, u32 event_in)
+        : cancel(cancel_in), queue(queue_in), event(event_in) {}
+
+    bool operator==(const RoseInstrTriggerInfix &ri) const {
+        return cancel == ri.cancel && queue == ri.queue && event == ri.event;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), cancel, queue, event);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrTriggerInfix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return cancel == ri.cancel && queue == ri.queue && event == ri.event;
+    }
+};
+
+class RoseInstrTriggerSuffix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_TRIGGER_SUFFIX,
+                                    ROSE_STRUCT_TRIGGER_SUFFIX,
+                                    RoseInstrTriggerSuffix> {
+public:
+    u32 queue;
+    u32 event;
+
+    RoseInstrTriggerSuffix(u32 queue_in, u32 event_in)
+        : queue(queue_in), event(event_in) {}
+
+    bool operator==(const RoseInstrTriggerSuffix &ri) const {
+        return queue == ri.queue && event == ri.event;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), queue, event);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrTriggerSuffix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return queue == ri.queue && event == ri.event;
+    }
+};
+
+class RoseInstrDedupe
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE, ROSE_STRUCT_DEDUPE,
+                                    RoseInstrDedupe> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupe(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in,
+                    const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupe &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), quash_som, dkey,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupe &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrDedupeSom
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE_SOM,
+                                    ROSE_STRUCT_DEDUPE_SOM,
+                                    RoseInstrDedupeSom> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupeSom(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in,
+                       const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupeSom &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), quash_som, dkey,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupeSom &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrReportChain
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_CHAIN,
+                                    ROSE_STRUCT_REPORT_CHAIN,
+                                    RoseInstrReportChain> {
+public:
+    u32 event;
+    u64a top_squash_distance;
+
+    RoseInstrReportChain(u32 event_in, u64a top_squash_distance_in)
+        : event(event_in), top_squash_distance(top_squash_distance_in) {}
+
+    bool operator==(const RoseInstrReportChain &ri) const {
+        return event == ri.event &&
+               top_squash_distance == ri.top_squash_distance;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), event,
+                        top_squash_distance);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportChain &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return event == ri.event &&
+               top_squash_distance == ri.top_squash_distance;
+    }
+};
+
+class RoseInstrReportSomInt
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_INT,
+                                    ROSE_STRUCT_REPORT_SOM_INT,
+                                    RoseInstrReportSomInt> {
+public:
+    som_operation som;
+
+    RoseInstrReportSomInt() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrReportSomInt &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomInt &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrReportSomAware
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_AWARE,
+                                    ROSE_STRUCT_REPORT_SOM_AWARE,
+                                    RoseInstrReportSomAware> {
+public:
+    som_operation som;
+
+    RoseInstrReportSomAware() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrReportSomAware &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomAware &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT, ROSE_STRUCT_REPORT,
+                                    RoseInstrReport> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrReport(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrReport &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
+
+class RoseInstrReportExhaust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_EXHAUST,
+                                    ROSE_STRUCT_REPORT_EXHAUST,
+                                    RoseInstrReportExhaust> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+    u32 ekey;
+
+    RoseInstrReportExhaust(ReportID onmatch_in, s32 offset_adjust_in,
+                           u32 ekey_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {}
+
+    bool operator==(const RoseInstrReportExhaust &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust, ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportExhaust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+};
+
+class RoseInstrReportSom
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM,
+                                    ROSE_STRUCT_REPORT_SOM,
+                                    RoseInstrReportSom> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrReportSom(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrReportSom &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSom &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
+
+class RoseInstrReportSomExhaust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_EXHAUST,
+                                    ROSE_STRUCT_REPORT_SOM_EXHAUST,
+                                    RoseInstrReportSomExhaust> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+    u32 ekey;
+
+    RoseInstrReportSomExhaust(ReportID onmatch_in, s32 offset_adjust_in,
+                              u32 ekey_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {}
+
+    bool operator==(const RoseInstrReportSomExhaust &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust, ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomExhaust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+};
+
+class RoseInstrDedupeAndReport
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE_AND_REPORT,
+                                    ROSE_STRUCT_DEDUPE_AND_REPORT,
+                                    RoseInstrDedupeAndReport> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    ReportID onmatch;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupeAndReport(u8 quash_som_in, u32 dkey_in, ReportID onmatch_in,
+                             s32 offset_adjust_in,
+                             const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in), onmatch(onmatch_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupeAndReport &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), quash_som, dkey, onmatch,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupeAndReport &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrFinalReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_FINAL_REPORT,
+                                    ROSE_STRUCT_FINAL_REPORT,
+                                    RoseInstrFinalReport> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrFinalReport(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrFinalReport &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrFinalReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
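The *_EXHAUST report instructions pair with CHECK_EXHAUSTED just below: a report carrying an exhaustion key (ekey) may fire at most once, after which any program guarded on that key skips straight to its fail target. Schematically (helper names invented, not the runtime's real API):

    if (ekey_is_set(stream_state, ekey)) {  // CHECK_EXHAUSTED
        pc = target;
    } else {
        fire_report(onmatch, end + offset_adjust);
        set_ekey(stream_state, ekey);       // REPORT_EXHAUST and friends
    }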
+
+class RoseInstrCheckExhausted
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_EXHAUSTED,
+                                    ROSE_STRUCT_CHECK_EXHAUSTED,
+                                    RoseInstrCheckExhausted> {
+public:
+    u32 ekey;
+    const RoseInstruction *target;
+
+    RoseInstrCheckExhausted(u32 ekey_in, const RoseInstruction *target_in)
+        : ekey(ekey_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckExhausted &ri) const {
+        return ekey == ri.ekey && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckExhausted &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return ekey == ri.ekey &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMinLength
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MIN_LENGTH,
+                                    ROSE_STRUCT_CHECK_MIN_LENGTH,
+                                    RoseInstrCheckMinLength> {
+public:
+    s32 end_adj;
+    u64a min_length;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMinLength(s32 end_adj_in, u64a min_length_in,
+                            const RoseInstruction *target_in)
+        : end_adj(end_adj_in), min_length(min_length_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMinLength &ri) const {
+        return end_adj == ri.end_adj && min_length == ri.min_length &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), end_adj, min_length);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMinLength &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return end_adj == ri.end_adj && min_length == ri.min_length &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSetState
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_STATE,
+                                    ROSE_STRUCT_SET_STATE,
+                                    RoseInstrSetState> {
+public:
+    u32 index;
+
+    explicit RoseInstrSetState(u32 index_in) : index(index_in) {}
+
+    bool operator==(const RoseInstrSetState &ri) const {
+        return index == ri.index;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSetState &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return index == ri.index;
+    }
+};
+
+class RoseInstrSetGroups
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_GROUPS,
+                                    ROSE_STRUCT_SET_GROUPS,
+                                    RoseInstrSetGroups> {
+public:
+    rose_group groups;
+
+    explicit RoseInstrSetGroups(rose_group groups_in) : groups(groups_in) {}
+
+    bool operator==(const RoseInstrSetGroups &ri) const {
+        return groups == ri.groups;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), groups);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSetGroups &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return groups == ri.groups;
+    }
+};
+
+class RoseInstrSquashGroups
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SQUASH_GROUPS,
+                                    ROSE_STRUCT_SQUASH_GROUPS,
+                                    RoseInstrSquashGroups> {
+public:
+    rose_group groups;
+
+    explicit RoseInstrSquashGroups(rose_group groups_in) : groups(groups_in) {}
+
+    bool operator==(const RoseInstrSquashGroups &ri) const {
+        return groups == ri.groups;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), groups);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSquashGroups &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return groups == ri.groups;
+    }
+};
+
+class RoseInstrCheckState
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_STATE,
+                                    ROSE_STRUCT_CHECK_STATE,
+                                    RoseInstrCheckState> {
+public:
+    u32 index;
+    const RoseInstruction *target;
+
+    RoseInstrCheckState(u32 index_in, const RoseInstruction *target_in)
+        : index(index_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckState &ri) const {
+        return index == ri.index && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckState &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return index == ri.index &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSparseIterBegin
+    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_BEGIN,
+                           ROSE_STRUCT_SPARSE_ITER_BEGIN,
+                           RoseInstrSparseIterBegin> {
+public:
+    u32 num_keys; // total number of multibit keys
+    std::vector<std::pair<u32, const RoseInstruction *>> jump_table;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterBegin(u32 num_keys_in,
+                             const RoseInstruction *target_in)
+        : num_keys(num_keys_in), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterBegin &ri) const {
+        return num_keys == ri.num_keys && jump_table == ri.jump_table &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        size_t v = hash_all(static_cast<int>(opcode), num_keys);
+        for (const u32 &key : jump_table | boost::adaptors::map_keys) {
+            boost::hash_combine(v, key);
+        }
+        return v;
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    void update_target(const RoseInstruction *old_target,
+                       const RoseInstruction *new_target) override {
+        if (target == old_target) {
+            target = new_target;
+        }
+        for (auto &jump : jump_table) {
+            if (jump.second == old_target) {
+                jump.second = new_target;
+            }
+        }
+    }
+
+    bool equiv_to(const RoseInstrSparseIterBegin &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        if (iter_offset != ri.iter_offset ||
+            offsets.at(target) != other_offsets.at(ri.target)) {
+            return false;
+        }
+        if (jump_table.size() != ri.jump_table.size()) {
+            return false;
+        }
+        auto it1 = jump_table.begin(), it2 = ri.jump_table.begin();
+        for (; it1 != jump_table.end(); ++it1, ++it2) {
+            if (it1->first != it2->first) {
+                return false;
+            }
+            if (offsets.at(it1->second) != other_offsets.at(it2->second)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+private:
+    friend class RoseInstrSparseIterNext;
+
+    // These variables allow us to use the same multibit iterator and jump
+    // table in subsequent SPARSE_ITER_NEXT write() operations.
+    mutable bool is_written = false;
+    mutable u32 iter_offset = 0;
+    mutable u32 jump_table_offset = 0;
+};
+
+class RoseInstrSparseIterNext
+    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_NEXT,
+                           ROSE_STRUCT_SPARSE_ITER_NEXT,
+                           RoseInstrSparseIterNext> {
+public:
+    u32 state;
+    const RoseInstrSparseIterBegin *begin;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterNext(u32 state_in,
+                            const RoseInstrSparseIterBegin *begin_in,
+                            const RoseInstruction *target_in)
+        : state(state_in), begin(begin_in), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterNext &ri) const {
+        return state == ri.state && begin == ri.begin && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), state);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    void update_target(const RoseInstruction *old_target,
+                       const RoseInstruction *new_target) override {
+        if (target == old_target) {
+            target = new_target;
+        }
+        if (begin == old_target) {
+            assert(new_target->code() == ROSE_INSTR_SPARSE_ITER_BEGIN);
+            begin = static_cast<const RoseInstrSparseIterBegin *>(new_target);
+        }
+    }
+
+    bool equiv_to(const RoseInstrSparseIterNext &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return state == ri.state &&
+               offsets.at(begin) == other_offsets.at(ri.begin) &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSparseIterAny
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_SPARSE_ITER_ANY,
+                                    ROSE_STRUCT_SPARSE_ITER_ANY,
+                                    RoseInstrSparseIterAny> {
+public:
+    u32 num_keys; // total number of multibit keys
+    std::vector<u32> keys;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterAny(u32 num_keys_in, std::vector<u32> keys_in,
+                           const RoseInstruction *target_in)
+        : num_keys(num_keys_in), keys(std::move(keys_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterAny &ri) const {
+        return num_keys == ri.num_keys && keys == ri.keys &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), num_keys, keys);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSparseIterAny &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return num_keys == ri.num_keys && keys == ri.keys &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
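A SPARSE_ITER_BEGIN instruction and the SPARSE_ITER_NEXT instructions that follow it share a single multibit sparse iterator and jump table; the mutable offsets cached in RoseInstrSparseIterBegin record where write() laid those structures out so later NEXT instructions can reuse them rather than emit copies. A hypothetical construction, with all values as placeholders:

    auto begin = make_unique<RoseInstrSparseIterBegin>(num_keys, end_inst);
    begin->jump_table.emplace_back(state_key_0, target_0);
    begin->jump_table.emplace_back(state_key_1, target_1);
    // Each RoseInstrSparseIterNext then names this begin plus one state key.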
+
+class RoseInstrEnginesEod
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_ENGINES_EOD,
+                                    ROSE_STRUCT_ENGINES_EOD,
+                                    RoseInstrEnginesEod> {
+public:
+    u32 iter_offset;
+
+    explicit RoseInstrEnginesEod(u32 iter_in) : iter_offset(iter_in) {}
+
+    bool operator==(const RoseInstrEnginesEod &ri) const {
+        return iter_offset == ri.iter_offset;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), iter_offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrEnginesEod &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return iter_offset == ri.iter_offset;
+    }
+};
+
+class RoseInstrSuffixesEod
+    : public RoseInstrBaseTrivial<ROSE_INSTR_SUFFIXES_EOD,
+                                  ROSE_STRUCT_SUFFIXES_EOD,
+                                  RoseInstrSuffixesEod> {
+public:
+    ~RoseInstrSuffixesEod() override;
+};
+
+class RoseInstrMatcherEod
+    : public RoseInstrBaseTrivial<ROSE_INSTR_MATCHER_EOD,
+                                  ROSE_STRUCT_MATCHER_EOD,
+                                  RoseInstrMatcherEod> {
+public:
+    ~RoseInstrMatcherEod() override;
+};
+
+class RoseInstrCheckLongLit
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_CHECK_LONG_LIT,
+                                    ROSE_STRUCT_CHECK_LONG_LIT,
+                                    RoseInstrCheckLongLit> {
+public:
+    std::string literal;
+
+    explicit RoseInstrCheckLongLit(std::string literal_in)
+        : literal(std::move(literal_in)) {}
+
+    bool operator==(const RoseInstrCheckLongLit &ri) const {
+        return literal == ri.literal;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckLongLit &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return literal == ri.literal;
+    }
+};
+
+class RoseInstrCheckLongLitNocase
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_CHECK_LONG_LIT_NOCASE,
+                                    ROSE_STRUCT_CHECK_LONG_LIT_NOCASE,
+                                    RoseInstrCheckLongLitNocase> {
+public:
+    std::string literal;
+
+    explicit RoseInstrCheckLongLitNocase(std::string literal_in)
+        : literal(std::move(literal_in)) {
+        upperString(literal);
+    }
+
+    bool operator==(const RoseInstrCheckLongLitNocase &ri) const {
+        return literal == ri.literal;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckLongLitNocase &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return literal == ri.literal;
+    }
+};
+
+class RoseInstrEnd
+    : public RoseInstrBaseTrivial<ROSE_INSTR_END, ROSE_STRUCT_END,
+                                  RoseInstrEnd> {
+public:
+    ~RoseInstrEnd() override;
+};
+
+/**
+ * \brief Container for a list of program instructions.
+ */
+class RoseProgram {
+private:
+    std::vector<std::unique_ptr<RoseInstruction>> prog;
+
+public:
+    RoseProgram() {
+        prog.push_back(make_unique<RoseInstrEnd>());
+    }
+
+    bool empty() const {
+        assert(!prog.empty());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+        // Empty if we only have one element, the END instruction.
+        return std::next(prog.begin()) == prog.end();
+    }
+
+    size_t size() const { return prog.size(); }
+
+    const RoseInstruction &back() const { return *prog.back(); }
+    const RoseInstruction &front() const { return *prog.front(); }
+
+    using iterator = decltype(prog)::iterator;
+    iterator begin() { return prog.begin(); }
+    iterator end() { return prog.end(); }
+
+    using const_iterator = decltype(prog)::const_iterator;
+    const_iterator begin() const { return prog.begin(); }
+    const_iterator end() const { return prog.end(); }
+
+    using reverse_iterator = decltype(prog)::reverse_iterator;
+    reverse_iterator rbegin() { return prog.rbegin(); }
+    reverse_iterator rend() { return prog.rend(); }
+
+    using const_reverse_iterator = decltype(prog)::const_reverse_iterator;
+    const_reverse_iterator rbegin() const { return prog.rbegin(); }
+    const_reverse_iterator rend() const { return prog.rend(); }
+
+    /** \brief Retrieve a pointer to the terminating ROSE_INSTR_END. */
+    const RoseInstruction *end_instruction() const {
+        assert(!prog.empty());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        return prog.back().get();
+    }
+
+private:
+    static void update_targets(iterator it, iterator it_end,
+                               const RoseInstruction *old_target,
+                               const RoseInstruction *new_target) {
+        assert(old_target && new_target && old_target != new_target);
+        for (; it != it_end; ++it) {
+            std::unique_ptr<RoseInstruction> &ri = *it;
+            assert(ri);
+            ri->update_target(old_target, new_target);
+        }
+    }
+
+public:
+    iterator insert(iterator it, std::unique_ptr<RoseInstruction> ri) {
+        assert(!prog.empty());
+        assert(it != end());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        return prog.insert(it, std::move(ri));
+    }
+
+    iterator insert(iterator it, RoseProgram &&block) {
+        assert(!prog.empty());
+        assert(it != end());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        if (block.empty()) {
+            return it;
+        }
+
+        const RoseInstruction *end_ptr = block.end_instruction();
+        assert(end_ptr->code() == ROSE_INSTR_END);
+        block.prog.pop_back();
+
+        const RoseInstruction *new_target = it->get();
+        update_targets(block.prog.begin(), block.prog.end(), end_ptr,
+                       new_target);
+
+        // Workaround: container insert() for ranges doesn't return an iterator
+        // in the version of the STL distributed with gcc 4.8.
+        auto dist = distance(prog.begin(), it);
+        prog.insert(it, std::make_move_iterator(block.prog.begin()),
+                    std::make_move_iterator(block.prog.end()));
+        it = prog.begin();
+        std::advance(it, dist);
+        return it;
+    }
+
+    /**
+     * \brief Adds this instruction to the program just before the terminating
+     * ROSE_INSTR_END.
+     */
+    void add_before_end(std::unique_ptr<RoseInstruction> ri) {
+        assert(!prog.empty());
+        insert(std::prev(prog.end()), std::move(ri));
+    }
+
+    /**
+     * \brief Adds this block to the program just before the terminating
+     * ROSE_INSTR_END.
+     */
+    void add_before_end(RoseProgram &&block) {
+        assert(!prog.empty());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        if (block.empty()) {
+            return;
+        }
+
+        insert(std::prev(prog.end()), std::move(block));
+    }
+
+    /**
+     * \brief Append this program block, replacing our current ROSE_INSTR_END.
+     */
+    void add_block(RoseProgram &&block) {
+        assert(!prog.empty());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        if (block.empty()) {
+            return;
+        }
+
+        // Replace pointers to the current END with pointers to the first
+        // instruction in the new sequence.
+        const RoseInstruction *end_ptr = end_instruction();
+        prog.pop_back();
+        update_targets(prog.begin(), prog.end(), end_ptr,
+                       block.prog.front().get());
+        prog.insert(prog.end(), std::make_move_iterator(block.prog.begin()),
+                    std::make_move_iterator(block.prog.end()));
+    }
+
+    /**
+     * \brief Replace the instruction pointed to by the given iterator.
+     */
+    template<class Iter>
+    void replace(Iter it, std::unique_ptr<RoseInstruction> ri) {
+        assert(!prog.empty());
+        assert(prog.back()->code() == ROSE_INSTR_END);
+
+        const RoseInstruction *old_ptr = it->get();
+        *it = move(ri);
+        update_targets(prog.begin(), prog.end(), old_ptr, it->get());
+
+        assert(prog.back()->code() == ROSE_INSTR_END);
+    }
+};
+
+aligned_unique_ptr<char>
+writeProgram(RoseEngineBlob &blob, const RoseProgram &program, u32 *total_len);
+
+class RoseProgramHash {
+public:
+    size_t operator()(const RoseProgram &program) const {
+        size_t v = 0;
+        for (const auto &ri : program) {
+            assert(ri);
+            boost::hash_combine(v, ri->hash());
+        }
+        return v;
+    }
+};
+
+class RoseProgramEquivalence {
+public:
+    bool operator()(const RoseProgram &prog1, const RoseProgram &prog2) const;
+};
+
+} // namespace ue2
+
+#endif // ROSE_BUILD_PROGRAM_H
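Taken together, RoseProgram and the hash()/equiv_to() machinery let the compiler assemble programs from spliced blocks and then share bytecode between identical ones. An illustrative use of the API, not taken from this patch (values are placeholders):

    RoseProgram block;
    block.add_before_end(make_unique<RoseInstrSomAdjust>(distance));

    RoseProgram program;
    program.add_block(std::move(block)); // users of the old END are retargeted

    // One plausible dedupe structure: identical programs hash and compare
    // equal, so a single bytecode offset can be reused for all of them.
    std::unordered_map<RoseProgram, u32, RoseProgramHash,
                       RoseProgramEquivalence> cache;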
diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp
index c2366f0e..c6139097 100644
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@@ -40,7 +40,6 @@
 #include "nfagraph/ng_is_equal.h"
 #include "nfagraph/ng_limex.h"
 #include "nfagraph/ng_prune.h"
-#include "nfagraph/ng_restructuring.h"
 #include "nfagraph/ng_uncalc_components.h"
 #include "nfagraph/ng_util.h"
 #include "util/bitutils.h"
@@ -112,11 +111,9 @@ struct AliasInEdge : EdgeAndVertex {
 class CandidateSet {
 public:
-    typedef RoseVertexSet::iterator iterator;
+    typedef set<RoseVertex>::iterator iterator;
     typedef RoseVertex key_type;
 
-    explicit CandidateSet(const VertexIndexComp &comp) : main_cont(comp) {}
-
     iterator begin() { return main_cont.begin(); }
     iterator end() { return main_cont.end(); }
@@ -152,7 +149,7 @@ public:
 private:
     /* if a vertex is worth storing, it is worth storing twice */
-    RoseVertexSet main_cont; /* deterministic iterator */
+    set<RoseVertex> main_cont; /* deterministic iterator */
     ue2::unordered_set<RoseVertex> hash_cont; /* member checks */
 };
@@ -257,10 +254,8 @@ bool samePredecessors(RoseVertex a, RoseVertex b, const RoseGraph &g) {
     }
 
     for (const auto &e_a : in_edges_range(a, g)) {
-        bool exists;
-        RoseEdge e;
-        tie(e,
exists) = edge_by_target(source(e_a, g), b, g); - if (!exists || g[e].rose_top != g[e_a].rose_top) { + RoseEdge e = edge(source(e_a, g), b, g); + if (!e || g[e].rose_top != g[e_a].rose_top) { DEBUG_PRINTF("bad tops\n"); return false; } @@ -274,10 +269,7 @@ static bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { for (const auto &e_a : out_edges_range(a, g)) { - bool exists; - RoseEdge e; - tie(e, exists) = edge(b, target(e_a, g), g); - if (exists) { + if (RoseEdge e = edge(b, target(e_a, g), g)) { if (g[e_a].maxBound < g[e].minBound || g[e].maxBound < g[e_a].minBound) { return true; @@ -296,10 +288,7 @@ static bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { for (const auto &e_a : in_edges_range(a, g)) { - bool exists; - RoseEdge e; - tie(e, exists) = edge_by_target(source(e_a, g), b, g); - if (exists) { + if (RoseEdge e = edge(source(e_a, g), b, g)) { if (g[e_a].maxBound < g[e].minBound || g[e].maxBound < g[e_a].minBound) { return true; @@ -499,11 +488,11 @@ void mergeEdgeAdd(RoseVertex u, RoseVertex v, const RoseEdge &from_edge, const RoseEdgeProps &from_props = g[from_edge]; if (!to_edge) { - DEBUG_PRINTF("adding edge [%zu,%zu]\n", g[u].idx, g[v].idx); + DEBUG_PRINTF("adding edge [%zu,%zu]\n", g[u].index, g[v].index); add_edge(u, v, from_props, g); } else { // union of the two edges. - DEBUG_PRINTF("updating edge [%zu,%zu]\n", g[u].idx, g[v].idx); + DEBUG_PRINTF("updating edge [%zu,%zu]\n", g[u].index, g[v].index); RoseEdgeProps &to_props = g[*to_edge]; to_props.minBound = min(to_props.minBound, from_props.minBound); to_props.maxBound = max(to_props.maxBound, from_props.maxBound); @@ -627,7 +616,7 @@ static void mergeVerticesLeft(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseAliasingInfo &rai) { RoseGraph &g = build.g; - DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].index, g[b].index); insert(&g[b].reports, g[a].reports); @@ -649,7 +638,7 @@ static void mergeVerticesRight(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseAliasingInfo &rai) { RoseGraph &g = build.g; - DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].index, g[b].index); insert(&g[b].reports, g[a].reports); g[b].min_offset = min(g[a].min_offset, g[b].min_offset); @@ -667,7 +656,7 @@ static void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseAliasingInfo &rai) { RoseGraph &g = build.g; - DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].index, g[b].index); // For a diamond merge, most properties are already the same (with the // notable exception of the literal set). 
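The repeated rewrites in this file from the old tie(e, exists) = edge_by_target(...) pattern to `if (RoseEdge e = edge(...))` lean on the new ue2_graph edge descriptor being contextually convertible to bool (false when no such edge exists), which is what makes the one-liner form safe. Schematically:

    if (RoseEdge e = edge(u, v, g)) {
        // the edge u->v exists and g[e] is valid here
    }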
@@ -684,7 +673,7 @@ static never_inline
 void findCandidates(const RoseBuildImpl &build, CandidateSet *candidates) {
     for (auto v : vertices_range(build.g)) {
         if (isAliasingCandidate(v, build)) {
-            DEBUG_PRINTF("candidate %zu\n", build.g[v].idx);
+            DEBUG_PRINTF("candidate %zu\n", build.g[v].index);
             DEBUG_PRINTF("lits: %u\n", *build.g[v].literals.begin());
             candidates->insert(v);
         }
@@ -747,10 +736,7 @@ bool hasCommonPredWithDiffRoses(RoseVertex a, RoseVertex b,
     const bool equal_roses = hasEqualLeftfixes(a, b, g);
 
     for (const auto &e_a : in_edges_range(a, g)) {
-        bool exists;
-        RoseEdge e;
-        tie(e, exists) = edge_by_target(source(e_a, g), b, g);
-        if (exists) {
+        if (RoseEdge e = edge(source(e_a, g), b, g)) {
             DEBUG_PRINTF("common pred, e_r=%d r_t %u,%u\n", (int)equal_roses,
                          g[e].rose_top, g[e_a].rose_top);
             if (!equal_roses) {
@@ -786,8 +772,8 @@ void pruneReportIfUnused(const RoseBuildImpl &build, shared_ptr<NGHolder> h,
     // unimplementable.
     DEBUG_PRINTF("report %u has been merged away, pruning\n", report);
-    assert(h->kind == build.isRootSuccessor(*verts.begin()) ? NFA_PREFIX
-                                                            : NFA_INFIX);
+    assert(h->kind == (build.isRootSuccessor(*verts.begin()) ? NFA_PREFIX
+                                                             : NFA_INFIX));
     unique_ptr<NGHolder> h_new = cloneHolder(*h);
     pruneReport(*h_new, report);
@@ -863,7 +849,13 @@ void pruneUnusedTops(CastleProto &castle, const RoseGraph &g,
 static
 void pruneUnusedTops(NGHolder &h, const RoseGraph &g,
                      const set<RoseVertex> &verts) {
-    ue2::unordered_set<u32> used_tops;
+    if (!is_triggered(h)) {
+        DEBUG_PRINTF("not triggered, no tops\n");
+        return;
+    }
+    assert(isCorrectlyTopped(h));
+    DEBUG_PRINTF("prunning unused tops\n");
+    ue2::flat_set<u32> used_tops;
     for (auto v : verts) {
         assert(g[v].left.graph.get() == &h);
 
@@ -879,10 +871,13 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g,
         if (v == h.startDs) {
            continue; // stylised edge, leave it alone.
        }
-        u32 top = h[e].top;
-        if (!contains(used_tops, top)) {
-            DEBUG_PRINTF("edge (start,%u) has unused top %u\n",
-                         h[v].index, top);
+        flat_set<u32> pruned_tops;
+        auto pt_inserter = inserter(pruned_tops, pruned_tops.end());
+        set_intersection(h[e].tops.begin(), h[e].tops.end(),
+                         used_tops.begin(), used_tops.end(), pt_inserter);
+        h[e].tops = move(pruned_tops);
+        if (h[e].tops.empty()) {
+            DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index);
             dead.push_back(e);
         }
     }
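With edges now carrying a set of tops rather than a single `top` value, the new pruneUnusedTops body above intersects each start edge's tops with the tops that some vertex still triggers, and kills the edge only when nothing survives. For instance, tops {0, 2, 5} on an edge intersected with used tops {2, 7} leave {2}; an empty intersection marks the edge dead.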
@@ -1116,8 +1111,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
             // We should be protected from merging common preds with tops leading
             // to completely different repeats by earlier checks, but just in
             // case...
-            if (edge(source(e, g), a, g).second) {
-                RoseEdge a_edge = edge(source(e, g), a, g).first;
+            if (RoseEdge a_edge = edge(source(e, g), a, g)) {
                 u32 a_top = g[a_edge].rose_top;
                 const PureRepeat &a_pr = m_castle->repeats[a_top]; // new report
                 if (pr != a_pr) {
@@ -1287,7 +1281,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     }
 
     DEBUG_PRINTF("attempting merge of roses on vertices %zu and %zu\n",
-                 g[a].idx, g[b].idx);
+                 g[a].index, g[b].index);
 
     set<RoseVertex> &b_verts = rai.rev_leftfix[b_left];
     set<RoseVertex> aa;
@@ -1327,8 +1321,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     DEBUG_PRINTF("winner %zu states\n", num_vertices(*b_h));
 
     if (!setDistinctRoseTops(g, victim, *b_h, deque<RoseVertex>(1, a))) {
-        assert(roseHasTops(g, a));
-        assert(roseHasTops(g, b));
+        assert(roseHasTops(build, a));
+        assert(roseHasTops(build, b));
         return false;
     }
@@ -1341,8 +1335,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     for (const auto &e : in_edges_range(a, g)) {
         g[e] = a_props[source(e, g)];
     }
-    assert(roseHasTops(g, a));
-    assert(roseHasTops(g, b));
+    assert(roseHasTops(build, a));
+    assert(roseHasTops(build, b));
     return false;
 }
@@ -1365,8 +1359,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
 
     reduceImplementableGraph(*b_h, SOM_NONE, nullptr, build.cc);
 
-    assert(roseHasTops(g, a));
-    assert(roseHasTops(g, b));
+    assert(roseHasTops(build, a));
+    assert(roseHasTops(build, b));
     assert(isImplementableNFA(*b_h, nullptr, build.cc));
     return true;
 }
@@ -1379,7 +1373,7 @@ bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
                       RoseVertex b, bool trivialCasesOnly,
                       RoseAliasingInfo &rai) {
     DEBUG_PRINTF("attempting rose merge, vertices a=%zu, b=%zu\n",
-                 build.g[a].idx, build.g[b].idx);
+                 build.g[a].index, build.g[b].index);
     assert(a != b);
 
     RoseGraph &g = build.g;
@@ -1417,8 +1411,8 @@ bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
         return false;
     }
 
-    assert(roseHasTops(g, a));
-    assert(roseHasTops(g, b));
+    assert(roseHasTops(build, a));
+    assert(roseHasTops(build, b));
 
     if (a_left_id.graph() && b_left_id.graph()) {
         return attemptRoseGraphMerge(build, preds_same, a, b, trivialCasesOnly,
@@ -1592,7 +1586,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &build,
 
         assert(contains(candidates, a));
 
-        DEBUG_PRINTF("trying to merge %zu into somebody\n", g[a].idx);
+        DEBUG_PRINTF("trying to merge %zu into somebody\n", g[a].index);
         for (auto jt = it; jt != siblings.end(); ++jt) {
             RoseVertex b = *jt;
             assert(contains(candidates, b));
@@ -1706,8 +1700,8 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build,
         RoseVertex pred = pickPred(a, g, build);
         siblings.clear();
 
-        if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred) ||
-            hasGreaterOutDegree(verts.size(), pred, g)) {
+        if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred)
+            || out_degree(pred, g) > verts.size()) {
             // Select sibling from amongst the vertices that share a literal.
siblings.insert(siblings.end(), verts.begin(), verts.end()); } else { @@ -1716,8 +1710,6 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, insert(&siblings, siblings.end(), adjacent_vertices(pred, g)); } - sort(siblings.begin(), siblings.end(), VertexIndexComp(g)); - auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, build, rai, candidates); if (jt == siblings.end()) { @@ -1737,6 +1729,7 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, } DEBUG_PRINTF("%zu candidates remaining\n", candidates.size()); + assert(!hasOrphanedTops(build)); } // Can't merge vertices with different root predecessors. @@ -1745,12 +1738,12 @@ bool safeRootPreds(RoseVertex a, RoseVertex b, const RoseGraph &g) { set a_roots, b_roots; for (auto u : inv_adjacent_vertices_range(a, g)) { - if (!hasGreaterInDegree(0, u, g)) { + if (!in_degree(u, g)) { a_roots.insert(u); } } for (auto u : inv_adjacent_vertices_range(b, g)) { - if (!hasGreaterInDegree(0, u, g)) { + if (!in_degree(u, g)) { b_roots.insert(u); } } @@ -1858,8 +1851,8 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &build, u32 lit_id = *g[a].literals.begin(); RoseVertex succ = pickSucc(a, g); const auto &verts = build.literal_info.at(lit_id).vertices; - if (succ != RoseGraph::null_vertex() && - !hasGreaterInDegree(verts.size(), succ, g)) { + if (succ != RoseGraph::null_vertex() + && in_degree(succ, g) < verts.size()) { if (!done_succ.insert(succ).second) { continue; // succ already in done_succ. } @@ -1892,7 +1885,7 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &build, } for (auto &siblings : sibling_cache | map_values) { - sort(siblings.begin(), siblings.end(), VertexIndexComp(build.g)); + sort(siblings.begin(), siblings.end()); } } @@ -1952,6 +1945,7 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build, } DEBUG_PRINTF("%zu candidates remaining\n", candidates.size()); + assert(!hasOrphanedTops(build)); } /** @@ -1966,7 +1960,7 @@ bool hasNoDiamondSiblings(const RoseGraph &g, RoseVertex v) { if (has_successor(v, g)) { bool only_succ = true; for (const auto &w : adjacent_vertices_range(v, g)) { - if (hasGreaterInDegree(1, w, g)) { + if (in_degree(w, g) > 1) { only_succ = false; break; } @@ -1982,7 +1976,7 @@ bool hasNoDiamondSiblings(const RoseGraph &g, RoseVertex v) { bool only_pred = true; for (const auto &u : inv_adjacent_vertices_range(v, g)) { - if (hasGreaterOutDegree(1, u, g)) { + if (out_degree(u, g) > 1) { only_pred = false; break; } @@ -2017,6 +2011,8 @@ void filterDiamondCandidates(RoseGraph &g, CandidateSet &candidates) { void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { const CompileContext &cc = build.cc; RoseGraph &g = build.g; + assert(!hasOrphanedTops(build)); + assert(canImplementGraphs(build)); if (!cc.grey.roseRoleAliasing || !cc.grey.roseGraphReduction) { return; @@ -2028,7 +2024,7 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { mergeRoses &= cc.grey.mergeRose & cc.grey.roseMergeRosesDuringAliasing; - CandidateSet candidates(g); + CandidateSet candidates; findCandidates(build, &candidates); DEBUG_PRINTF("candidates %zu\n", candidates.size()); @@ -2050,6 +2046,8 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { DEBUG_PRINTF("killed %zu vertices\n", dead.size()); build.removeVertices(dead); + assert(!hasOrphanedTops(build)); + assert(canImplementGraphs(build)); } } // namespace ue2 diff --git a/src/rose/rose_build_util.h b/src/rose/rose_build_util.h index 85cfc010..81bb6845 100644 --- 
a/src/rose/rose_build_util.h +++ b/src/rose/rose_build_util.h @@ -39,31 +39,6 @@ namespace ue2 { /** Max allowed width for transient graphs in block mode */ #define ROSE_BLOCK_TRANSIENT_MAX_WIDTH 255U -// Comparator for vertices using their index property. -struct VertexIndexComp { - VertexIndexComp(const RoseGraph &gg) : g(gg) {} - - bool operator()(const RoseVertex &a, const RoseVertex &b) const { - const RoseVertexProps &pa = g[a]; - const RoseVertexProps &pb = g[b]; - - if (pa.idx < pb.idx) { - return true; - } - if (pa.idx > pb.idx) { - return false; - } - - assert(a == b); // All vertex indices should be distinct. - return a < b; - } - - const RoseGraph &g; -}; - -// Vertex set type, ordered by index. Construct with a graph reference. -typedef std::set RoseVertexSet; - /** * \brief Add two Rose depths together, coping correctly with infinity at * ROSE_BOUND_INF. diff --git a/src/rose/rose_build_width.cpp b/src/rose/rose_build_width.cpp index 6bfcee48..182b62ee 100644 --- a/src/rose/rose_build_width.cpp +++ b/src/rose/rose_build_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -77,19 +77,20 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { u32 minWidth = ROSE_BOUND_INF; for (auto v : reachable) { if (g[v].eod_accept) { - DEBUG_PRINTF("skipping %zu - not a real vertex\n", g[v].idx); + DEBUG_PRINTF("skipping %zu - not a real vertex\n", g[v].index); continue; } const u32 w = g[v].min_offset; if (!g[v].reports.empty()) { - DEBUG_PRINTF("%zu can fire report at offset %u\n", g[v].idx, w); + DEBUG_PRINTF("%zu can fire report at offset %u\n", g[v].index, w); minWidth = min(minWidth, w); } if (is_end_anchored(g, v)) { - DEBUG_PRINTF("%zu can fire eod report at offset %u\n", g[v].idx, w); + DEBUG_PRINTF("%zu can fire eod report at offset %u\n", g[v].index, + w); minWidth = min(minWidth, w); } @@ -98,7 +99,7 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { assert(suffix_width.is_reachable()); DEBUG_PRINTF("%zu has suffix with top %u (width %s), can fire " "report at %u\n", - g[v].idx, g[v].suffix.top, suffix_width.str().c_str(), + g[v].index, g[v].suffix.top, suffix_width.str().c_str(), w + suffix_width); minWidth = min(minWidth, w + suffix_width); } @@ -203,10 +204,10 @@ u32 findMaxBAWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { // Everyone's anchored, so the max width can be taken from the max // max_offset on our vertices (so long as all accepts are ACCEPT_EOD). 
for (auto v : reachable) { - DEBUG_PRINTF("inspecting vert %zu\n", g[v].idx); + DEBUG_PRINTF("inspecting vert %zu\n", g[v].index); if (g[v].eod_accept) { - DEBUG_PRINTF("skipping %zu - not a real vertex\n", g[v].idx); + DEBUG_PRINTF("skipping %zu - not a real vertex\n", g[v].index); continue; } diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index a3d00943..1867be50 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -42,16 +42,17 @@ #include "nfa/nfa_internal.h" #include "nfa/nfa_kind.h" #include "util/dump_charclass.h" -#include "util/multibit_internal.h" +#include "util/multibit_build.h" #include "util/multibit.h" #include #include #include #include +#include #include -#include #include +#include #include #ifndef DUMP_SUPPORT @@ -234,9 +235,12 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { const char *pc_base = pc; for (;;) { u8 code = *(const u8 *)pc; - assert(code <= ROSE_INSTR_END); + assert(code <= LAST_ROSE_INSTRUCTION); const size_t offset = pc - pc_base; switch (code) { + PROGRAM_CASE(END) { return; } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ANCHORED_DELAY) { os << " groups 0x" << std::hex << ri->groups << std::dec << endl; @@ -244,16 +248,6 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(CHECK_LIT_MASK) { - os << " and_mask " - << dumpStrMask(ri->and_mask.a8, sizeof(ri->and_mask.a8)) - << endl; - os << " cmp_mask " - << dumpStrMask(ri->cmp_mask.a8, sizeof(ri->cmp_mask.a8)) - << endl; - } - PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(CHECK_LIT_EARLY) { os << " min_offset " << ri->min_offset << endl; } @@ -283,6 +277,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) { + os << " offset " << int{ri->offset} << endl; + os << " reach_index " << ri->reach_index << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + const u8 *base = (const u8 *)t; + const u8 *reach_base = base + t->lookaroundReachOffset; + const u8 *reach = reach_base + + ri->reach_index * REACH_BITVECTOR_LEN; + os << " contents "; + describeClass(os, bitvectorToReach(reach), 1000, CC_OUT_TEXT); + os << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_LOOKAROUND) { os << " index " << ri->index << endl; os << " count " << ri->count << endl; @@ -303,6 +311,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK_32) { + os << " and_mask " + << dumpStrMask(ri->and_mask, sizeof(ri->and_mask)) + << endl; + os << " cmp_mask " + << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_BYTE) { os << " and_mask 0x" << std::hex << std::setw(2) << std::setfill('0') << u32{ri->and_mask} << std::dec @@ -316,6 +338,71 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SHUFTI_16x8) { + os << " nib_mask " + << dumpStrMask(ri->nib_mask, sizeof(ri->nib_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + 
PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; @@ -507,6 +594,12 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(SPARSE_ITER_ANY) { + os << " iter_offset " << ri->iter_offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { os << " iter_offset " << ri->iter_offset << endl; } @@ -518,7 +611,22 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { PROGRAM_CASE(MATCHER_EOD) {} PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(END) { return; } + PROGRAM_CASE(CHECK_LONG_LIT) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) << "\"" << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) << "\"" << endl; + } PROGRAM_NEXT_INSTRUCTION default: @@ -573,9 +681,8 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { ofstream os(filename); const char *base = (const char *)t; - os << "EOD Program:" << endl; - if (t->eodProgramOffset) { + os << "EOD Program @ " << t->eodProgramOffset << ":" << endl; dumpProgram(os, t, base + t->eodProgramOffset); os << endl; } else { @@ -810,24 +917,14 @@ void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) { const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); const NFA *n = getNfaByInfo(t, nfa_info); - stringstream sstxt, ssdot, ssraw; - - sstxt << base << "rose_nfa_" << i << ".txt"; - ssdot << base << "rose_nfa_" << i << ".dot"; - ssraw << base << "rose_nfa_" << i << ".raw"; - - FILE *f; - - f = fopen(ssdot.str().c_str(), "w"); - 
nfaDumpDot(n, f, base); - fclose(f); - - f = fopen(sstxt.str().c_str(), "w"); - nfaDumpText(n, f); - fclose(f); + stringstream ssbase; + ssbase << base << "rose_nfa_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); if (dump_raw) { - f = fopen(ssraw.str().c_str(), "w"); + stringstream ssraw; + ssraw << base << "rose_nfa_" << i << ".raw"; + FILE *f = fopen(ssraw.str().c_str(), "w"); fwrite(n, 1, n->length, f); fclose(f); } @@ -870,24 +967,14 @@ void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) { for (u32 i = 0; i < t->somRevCount; i++) { const NFA *n = (const NFA *)(tp + rev_offsets[i]); - stringstream sstxt, ssdot, ssraw; - - sstxt << base << "som_rev_nfa_" << i << ".txt"; - ssdot << base << "som_rev_nfa_" << i << ".dot"; - ssraw << base << "som_nfa_nfa_" << i << ".raw"; - - FILE *f; - - f = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(n, f, base); - fclose(f); - - f = fopen(sstxt.str().c_str(), "w"); - nfaDumpText(n, f); - fclose(f); + stringstream ssbase; + ssbase << base << "som_rev_nfa_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); if (dump_raw) { - f = fopen(ssraw.str().c_str(), "w"); + stringstream ssraw; + ssraw << base << "som_rev_nfa_" << i << ".raw"; + FILE *f = fopen(ssraw.str().c_str(), "w"); fwrite(n, 1, n->length, f); fclose(f); } @@ -902,20 +989,10 @@ void dumpAnchored(const RoseEngine *t, const string &base) { while (curr) { const NFA *n = (const NFA *)((const char *)curr + sizeof(*curr)); - stringstream sstxt, ssdot; - sstxt << base << "anchored_" << i << ".txt"; - ssdot << base << "anchored_" << i << ".dot"; - - FILE *f; - - f = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(n, f, base); - fclose(f); - - f = fopen(sstxt.str().c_str(), "w"); - nfaDumpText(n, f); - fclose(f); + stringstream ssbase; + ssbase << base << "anchored_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); curr = curr->next_offset ? 
(const anchored_matcher_info *) ((const char *)curr + curr->next_offset) : nullptr; @@ -943,6 +1020,63 @@ void dumpAnchoredStats(const void *atable, FILE *f) { } +static +void dumpLongLiteralSubtable(const RoseLongLitTable *ll_table, + const RoseLongLitSubtable *ll_sub, FILE *f) { + if (!ll_sub->hashBits) { + fprintf(f, " \n"); + return; + } + + const char *base = (const char *)ll_table; + + u32 nbits = ll_sub->hashBits; + u32 num_entries = 1U << nbits; + const auto *tab = (const RoseLongLitHashEntry *)(base + ll_sub->hashOffset); + u32 hash_occ = + count_if(tab, tab + num_entries, [](const RoseLongLitHashEntry &ent) { + return ent.str_offset != 0; + }); + float hash_occ_percent = ((float)hash_occ / (float)num_entries) * 100; + + fprintf(f, " hash table : %u bits, occupancy %u/%u (%0.1f%%)\n", + nbits, hash_occ, num_entries, hash_occ_percent); + + u32 bloom_bits = ll_sub->bloomBits; + u32 bloom_size = 1U << bloom_bits; + const u8 *bloom = (const u8 *)base + ll_sub->bloomOffset; + u32 bloom_occ = accumulate(bloom, bloom + bloom_size / 8, 0, + [](const u32 &sum, const u8 &elem) { return sum + popcount32(elem); }); + float bloom_occ_percent = ((float)bloom_occ / (float)(bloom_size)) * 100; + + fprintf(f, " bloom filter : %u bits, occupancy %u/%u (%0.1f%%)\n", + bloom_bits, bloom_occ, bloom_size, bloom_occ_percent); +} + +static +void dumpLongLiteralTable(const RoseEngine *t, FILE *f) { + if (!t->longLitTableOffset) { + return; + } + + fprintf(f, "\n"); + fprintf(f, "Long literal table (streaming):\n"); + + const auto *ll_table = + (const struct RoseLongLitTable *)loadFromByteCodeOffset( + t, t->longLitTableOffset); + + fprintf(f, " total size : %u bytes\n", ll_table->size); + fprintf(f, " longest len : %u\n", ll_table->maxLen); + fprintf(f, " stream state : %u bytes\n", ll_table->streamStateBytes); + + fprintf(f, " caseful:\n"); + dumpLongLiteralSubtable(ll_table, &ll_table->caseful, f); + + fprintf(f, " nocase:\n"); + dumpLongLiteralSubtable(ll_table, &ll_table->nocase, f); +} + // Externally accessible functions void roseDumpText(const RoseEngine *t, FILE *f) { @@ -1018,7 +1152,7 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8); fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); - fprintf(f, " - floating matcher : %u bytes\n", t->floatingStreamState); + fprintf(f, " - long lit matcher : %u bytes\n", t->longLitStreamState); fprintf(f, " - active array : %u bytes\n", mmbit_size(t->activeArrayCount)); fprintf(f, " - active rose : %u bytes\n", @@ -1072,6 +1206,8 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, "\nSmall-block literal matcher stats:\n\n"); hwlmPrintStats(sbtable, f); } + + dumpLongLiteralTable(t, f); } #define DUMP_U8(o, member) \ @@ -1096,8 +1232,10 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, historyRequired); DUMP_U32(t, ekeyCount); DUMP_U32(t, dkeyCount); + DUMP_U32(t, dkeyLogSize); DUMP_U32(t, invDkeyOffset); DUMP_U32(t, somLocationCount); + DUMP_U32(t, somLocationFatbitSize); DUMP_U32(t, rolesWithStateCount); DUMP_U32(t, stateSize); DUMP_U32(t, anchorStateSize); @@ -1108,6 +1246,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, ematcherOffset); DUMP_U32(t, fmatcherOffset); DUMP_U32(t, sbmatcherOffset); + DUMP_U32(t, longLitTableOffset); DUMP_U32(t, amatcherMinWidth); DUMP_U32(t, fmatcherMinWidth); DUMP_U32(t, eodmatcherMinWidth); @@ -1121,8 +1260,10 @@ void 
roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, activeArrayCount);
     DUMP_U32(t, activeLeftCount);
     DUMP_U32(t, queueCount);
+    DUMP_U32(t, activeQueueArraySize);
     DUMP_U32(t, eagerIterOffset);
     DUMP_U32(t, handledKeyCount);
+    DUMP_U32(t, handledKeyFatbitSize);
     DUMP_U32(t, leftOffset);
     DUMP_U32(t, roseCount);
     DUMP_U32(t, lookaroundTableOffset);
@@ -1143,8 +1284,10 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U64(t, floating_group_mask);
     DUMP_U32(t, size);
     DUMP_U32(t, delay_count);
+    DUMP_U32(t, delay_fatbit_size);
     DUMP_U32(t, delay_base_id);
     DUMP_U32(t, anchored_count);
+    DUMP_U32(t, anchored_fatbit_size);
     DUMP_U32(t, anchored_base_id);
     DUMP_U32(t, maxFloatingDelayedMatch);
     DUMP_U32(t, delayRebuildLength);
@@ -1157,7 +1300,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, stateOffsets.anchorState);
     DUMP_U32(t, stateOffsets.groups);
     DUMP_U32(t, stateOffsets.groups_size);
-    DUMP_U32(t, stateOffsets.floatingMatcherState);
+    DUMP_U32(t, stateOffsets.longLitState);
     DUMP_U32(t, stateOffsets.somLocation);
     DUMP_U32(t, stateOffsets.somValid);
     DUMP_U32(t, stateOffsets.somWritable);
@@ -1176,7 +1319,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, ematcherRegionSize);
     DUMP_U32(t, somRevCount);
     DUMP_U32(t, somRevOffsetOffset);
-    DUMP_U32(t, floatingStreamState);
+    DUMP_U32(t, longLitStreamState);
     fprintf(f, "}\n");
     fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine));
 }
diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h
index 6abe629b..c3af749f 100644
--- a/src/rose/rose_graph.h
+++ b/src/rose/rose_graph.h
@@ -44,11 +44,10 @@
 #include "util/charreach.h"
 #include "util/depth.h"
 #include "util/ue2_containers.h"
+#include "util/ue2_graph.h"
 #include <memory>
 #include <set>
-#include <boost/graph/adjacency_list.hpp>
-#include <boost/graph/properties.hpp>
 namespace ue2 {
@@ -139,7 +138,7 @@ struct RoseSuffixInfo {
 /** \brief Properties attached to each Rose graph vertex. */
 struct RoseVertexProps {
     /** \brief Unique dense vertex index. Used for BGL algorithms. */
-    size_t idx = ~size_t{0};
+    size_t index = ~size_t{0};
     /** \brief IDs of literals in the Rose literal map. */
     flat_set<u32> literals;
@@ -183,6 +182,9 @@ struct RoseVertexProps {
 /** \brief Properties attached to each Rose graph edge. */
 /* bounds are distance from end of prev to start of the next */
 struct RoseEdgeProps {
+    /** \brief Unique dense edge index. Used for BGL algorithms. */
+    size_t index = ~size_t{0};
+
     /**
      * \brief Minimum distance from the end of the source role's match to the
      * start of the target role's match.
@@ -215,18 +217,10 @@ bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b);
 /**
  * \brief Core Rose graph structure.
- *
- * Note that we use the list selector for the edge and vertex lists: we depend
- * on insertion order for determinism, so we must use these containers.
 */
-using RoseGraph = boost::adjacency_list<boost::listS, boost::listS,
-                                        boost::bidirectionalS,
-                                        RoseVertexProps, RoseEdgeProps>;
-
+struct RoseGraph : public ue2_graph<RoseGraph, RoseVertexProps,
+                                    RoseEdgeProps> {
+    friend class RoseBuildImpl; /* to allow index renumbering */
+};
 using RoseVertex = RoseGraph::vertex_descriptor;
 using RoseEdge = RoseGraph::edge_descriptor;
diff --git a/src/rose/rose_in_dump.cpp b/src/rose/rose_in_dump.cpp
index fbd6858b..172b58e8 100644
--- a/src/rose/rose_in_dump.cpp
+++ b/src/rose/rose_in_dump.cpp
@@ -122,7 +122,7 @@ void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey,
         ostringstream name;
         name << grey.dumpPath << "pre_rose_" << id << ".dot";
-        dumpGraph(name.str().c_str(), h->g);
+        dumpGraph(name.str().c_str(), *h);
         assert(allMatchStatesHaveReports(*h));
     }
diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h
index 14d4d9b2..0e218576 100644
--- a/src/rose/rose_in_graph.h
+++ b/src/rose/rose_in_graph.h
@@ -46,13 +46,11 @@
 #include "ue2common.h"
 #include "rose/rose_common.h"
 #include "util/ue2_containers.h"
+#include "util/ue2_graph.h"
 #include "util/ue2string.h"
 #include <memory>
-#include <boost/graph/adjacency_list.hpp>
-#include <boost/graph/properties.hpp>
-
 namespace ue2 {
 class NGHolder;
@@ -128,6 +126,7 @@ public:
     flat_set<ReportID> reports; /**< for RIV_ACCEPT/RIV_ACCEPT_EOD */
     u32 min_offset; /**< Minimum offset at which this vertex can match. */
     u32 max_offset; /**< Maximum offset at which this vertex can match. */
+    size_t index = 0;
 };
 struct RoseInEdgeProps {
@@ -174,11 +173,12 @@ struct RoseInEdgeProps {
     std::shared_ptr<raw_som_dfa> haig;
     u32 graph_lag;
+    size_t index = 0;
 };
-typedef boost::adjacency_list<boost::listS, boost::listS,
-                              boost::bidirectionalS, RoseInVertexProps,
-                              RoseInEdgeProps> RoseInGraph;
+struct RoseInGraph
+    : public ue2_graph<RoseInGraph, RoseInVertexProps, RoseInEdgeProps> {
+};
 typedef RoseInGraph::vertex_descriptor RoseInVertex;
 typedef RoseInGraph::edge_descriptor RoseInEdge;
diff --git a/src/rose/rose_in_util.cpp b/src/rose/rose_in_util.cpp
index cce6ff35..3b31b38e 100644
--- a/src/rose/rose_in_util.cpp
+++ b/src/rose/rose_in_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -48,27 +48,15 @@ using namespace std;
 namespace ue2 {
-static
-void populateIndexMap(const RoseInGraph &in,
-                      map<RoseInVertex, size_t> *index_map) {
-    size_t i = 0;
-    for (auto v : vertices_range(in)) {
-        (*index_map)[v] = i++;
-    }
-}
-
 /* Returns a topological ordering of the vertices in g. That is, the starts
  * are at the front and all the predecessors of a vertex occur earlier in the
  * list than the vertex.
 */
vector<RoseInVertex> topo_order(const RoseInGraph &g) {
-    map<RoseInVertex, size_t> index_map;
-    populateIndexMap(g, &index_map);
-
+    assert(hasCorrectlyNumberedVertices(g));
     vector<RoseInVertex> v_order;
-    v_order.reserve(index_map.size());
+    v_order.reserve(num_vertices(g));
-    topological_sort(g, back_inserter(v_order),
-        vertex_index_map(boost::make_assoc_property_map(index_map)));
+    boost::topological_sort(g, back_inserter(v_order));
     reverse(v_order.begin(), v_order.end()); /* put starts at the front */
@@ -105,6 +93,7 @@ private:
 }
 unique_ptr<RoseInGraph> cloneRoseGraph(const RoseInGraph &ig) {
+    assert(hasCorrectlyNumberedVertices(ig));
     unique_ptr<RoseInGraph> out = make_unique<RoseInGraph>();
     unordered_map<const NGHolder *, shared_ptr<NGHolder>> graph_map;
@@ -120,12 +109,8 @@ unique_ptr<RoseInGraph> cloneRoseGraph(const RoseInGraph &ig) {
         }
     }
-    map<RoseInVertex, size_t> index_map;
-    populateIndexMap(ig, &index_map);
-
     copy_graph(ig, *out,
-               boost::edge_copy(RoseEdgeCopier(ig, *out, graph_map, haig_map))
-                   .vertex_index_map(boost::make_assoc_property_map(index_map)));
+               boost::edge_copy(RoseEdgeCopier(ig, *out, graph_map, haig_map)));
     return out;
 }
diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h
index 51913984..411ce03f 100644
--- a/src/rose/rose_internal.h
+++ b/src/rose/rose_internal.h
@@ -217,8 +217,8 @@ struct RoseStateOffsets {
     /** Size of packed Rose groups value, in bytes. */
     u32 groups_size;
-    /** State for floating literal matcher (managed by HWLM). */
-    u32 floatingMatcherState;
+    /** State for long literal support. */
+    u32 longLitState;
     /** Packed SOM location slots. */
     u32 somLocation;
@@ -309,9 +311,11 @@ struct RoseEngine {
     u32 historyRequired; /**< max amount of history required for streaming */
     u32 ekeyCount; /**< number of exhaustion keys */
     u32 dkeyCount; /**< number of dedupe keys */
+    u32 dkeyLogSize; /**< size of fatbit for storing dkey log (bytes) */
     u32 invDkeyOffset; /**< offset to table mapping from dkeys to the external
                         * report ids */
     u32 somLocationCount; /**< number of som locations required */
+    u32 somLocationFatbitSize; /**< size of SOM location fatbit (bytes) */
     u32 rolesWithStateCount; // number of roles with entries in state bitset
     u32 stateSize; /* size of the state bitset
                     * WARNING: not the size of the rose state */
@@ -325,6 +327,7 @@ struct RoseEngine {
     u32 ematcherOffset; // offset of the eod-anchored literal matcher (bytes)
     u32 fmatcherOffset; // offset of the floating literal matcher (bytes)
     u32 sbmatcherOffset; // offset of the small-block literal matcher (bytes)
+    u32 longLitTableOffset; // offset of the long literal table
     u32 amatcherMinWidth; /**< minimum number of bytes required for a pattern
                            * involved with the anchored table to produce a full
                            * match. */
@@ -369,14 +372,18 @@ struct RoseEngine {
     u32 activeArrayCount; //number of nfas tracked in the active array
     u32 activeLeftCount; //number of nfas tracked in the active rose array
     u32 queueCount; /**< number of nfa queues */
+    u32 activeQueueArraySize; //!< size of fatbit for active queues (bytes)
     u32 eagerIterOffset; /**< offset to sparse iter for eager prefixes or 0 if
                           * none */
     /** \brief Number of keys used by CHECK_SET_HANDLED instructions in role
-     * programs. Used to size the handled_roles fatbit in scratch. */
+     * programs. */
     u32 handledKeyCount;
+
+    /** \brief Size of the handled keys fatbit in scratch (bytes).
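
The rewritten topo_order() above can call boost::topological_sort directly because ue2_graph maintains a dense interior index property on vertices; the old listS-based adjacency_list had no vertex_index at all, which is why an associative property map had to be built first. A rough standalone illustration of the same effect using a plain BGL graph, where the vecS selector provides an implicit vertex_index playing the role of ue2_graph's index:

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/topological_sort.hpp>
    #include <iostream>
    #include <vector>

    int main() {
        using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                            boost::directedS>;
        Graph g(4);
        boost::add_edge(0, 1, g);
        boost::add_edge(1, 2, g);
        boost::add_edge(0, 3, g);

        // With a built-in vertex_index, no vertex_index_map is needed.
        std::vector<Graph::vertex_descriptor> order;
        boost::topological_sort(g, std::back_inserter(order));

        // topological_sort emits vertices in reverse topological order.
        for (auto it = order.rbegin(); it != order.rend(); ++it) {
            std::cout << *it << "\n";
        }
        return 0;
    }
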
*/ + u32 handledKeyFatbitSize; + u32 leftOffset; u32 roseCount; u32 lookaroundTableOffset; //!< base of lookaround offset list (of s8 values) @@ -411,9 +418,11 @@ struct RoseEngine { rose_group floating_group_mask; /* groups that are used by the ftable */ u32 size; // (bytes) u32 delay_count; /* number of delayed literal ids. */ + u32 delay_fatbit_size; //!< size of each delay fatbit in scratch (bytes) u32 delay_base_id; /* literal id of the first delayed literal. * delayed literal ids are contiguous */ u32 anchored_count; /* number of anchored literal ids */ + u32 anchored_fatbit_size; //!< size of each anch fatbit in scratch (bytes) u32 anchored_base_id; /* literal id of the first literal in the A table. * anchored literal ids are contiguous */ u32 maxFloatingDelayedMatch; /* max offset that a delayed literal can @@ -434,7 +443,7 @@ struct RoseEngine { u32 ematcherRegionSize; /* max region size to pass to ematcher */ u32 somRevCount; /**< number of som reverse nfas */ u32 somRevOffsetOffset; /**< offset to array of offsets to som rev nfas */ - u32 floatingStreamState; // size in bytes + u32 longLitStreamState; // size in bytes struct scatter_full_plan state_init; }; @@ -445,6 +454,72 @@ struct ALIGN_CL_DIRECTIVE anchored_matcher_info { u32 anchoredMinDistance; /* start of region to run anchored table over */ }; +/** + * \brief Long literal subtable for a particular mode (caseful or nocase). + */ +struct RoseLongLitSubtable { + /** + * \brief Offset of the hash table (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 hashOffset; + + /** + * \brief Offset of the bloom filter (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 bloomOffset; + + /** \brief lg2 of the size of the hash table. */ + u8 hashBits; + + /** \brief Size of the bloom filter in bits. */ + u8 bloomBits; + + /** \brief Number of bits of packed stream state used. */ + u8 streamStateBits; +}; + +/** + * \brief Long literal table header. + */ +struct RoseLongLitTable { + /** + * \brief Total size of the whole table (including strings, bloom filters, + * hash tables). + */ + u32 size; + + /** \brief Caseful sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable caseful; + + /** \brief Caseless sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable nocase; + + /** \brief Total size of packed stream state in bytes. */ + u8 streamStateBytes; + + /** \brief Max length of literal prefixes. */ + u8 maxLen; +}; + +/** + * \brief One of these structures per hash table entry in our long literal + * table. + */ +struct RoseLongLitHashEntry { + /** + * \brief Offset of the literal string itself, relative to + * RoseLongLitTable base. Zero if this bucket is empty. + */ + u32 str_offset; + + /** \brief Length of the literal string. */ + u32 str_len; +}; + static really_inline const struct anchored_matcher_info *getALiteralMatcher( const struct RoseEngine *t) { diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 545e190f..ed913316 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -42,16 +42,22 @@ /** \brief Role program instruction opcodes. */ enum RoseInstructionCode { + ROSE_INSTR_END, //!< End of program. ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. - ROSE_INSTR_CHECK_LIT_MASK, //!< Check and/cmp mask. ROSE_INSTR_CHECK_LIT_EARLY, //!< Skip matches before floating min offset. ROSE_INSTR_CHECK_GROUPS, //!< Check that literal groups are on. 
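
All of the offsets in the long literal structures above (hashOffset, bloomOffset, str_offset) are relative to the start of the RoseLongLitTable, so resolving them is plain pointer arithmetic on the table base. A hedged sketch, assuming only the RoseLongLitTable/RoseLongLitSubtable/RoseLongLitHashEntry declarations above and the ue2 integer typedefs; getBucket and getLiteral are illustrative helpers, not part of Hyperscan:

    // Resolve a hash bucket of a caseful/caseless subtable (sketch only).
    static const struct RoseLongLitHashEntry *
    getBucket(const struct RoseLongLitTable *table,
              const struct RoseLongLitSubtable *sub, u32 hash) {
        const char *base = (const char *)table; // offsets are table-relative
        const struct RoseLongLitHashEntry *tab =
            (const struct RoseLongLitHashEntry *)(base + sub->hashOffset);
        u32 mask = (1U << sub->hashBits) - 1; // 1 << hashBits entries
        return &tab[hash & mask];
    }

    static const char *getLiteral(const struct RoseLongLitTable *table,
                                  const struct RoseLongLitHashEntry *ent) {
        // str_offset == 0 marks an empty bucket; otherwise it points at the
        // literal string stored inside the table allocation.
        return ent->str_offset ? (const char *)table + ent->str_offset
                               : nullptr;
    }
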
 ROSE_INSTR_CHECK_ONLY_EOD, //!< Role matches only at EOD.
 ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0.
 ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled".
+    ROSE_INSTR_CHECK_SINGLE_LOOKAROUND, //!< Single lookaround check.
 ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check.
 ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check.
+    ROSE_INSTR_CHECK_MASK_32, //!< 32-byte and/cmp/neg mask check.
 ROSE_INSTR_CHECK_BYTE, //!< Single Byte check.
+    ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti.
+    ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti.
+    ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti.
+    ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti.
 ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state.
 ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state.
 ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches.
@@ -99,6 +105,7 @@ enum RoseInstructionCode {
 ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit.
 ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states.
 ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states.
+    ROSE_INSTR_SPARSE_ITER_ANY, //!< Test for any bit in the sparse iterator.
 /** \brief Check outfixes and suffixes for EOD and fire reports if so. */
 ROSE_INSTR_ENGINES_EOD,
@@ -110,7 +117,23 @@ enum RoseInstructionCode {
 /** \brief Run the EOD-anchored HWLM literal matcher. */
 ROSE_INSTR_MATCHER_EOD,
-    ROSE_INSTR_END //!< End of program.
+    /**
+     * \brief Confirm a case-sensitive literal at the current offset. In
+     * streaming mode, this makes use of the long literal table.
+     */
+    ROSE_INSTR_CHECK_LONG_LIT,
+
+    /**
+     * \brief Confirm a case-insensitive literal at the current offset. In
+     * streaming mode, this makes use of the long literal table.
+     */
+    ROSE_INSTR_CHECK_LONG_LIT_NOCASE,
+
+    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_LONG_LIT_NOCASE //!< Sentinel.
+};
+
+struct ROSE_STRUCT_END {
+    u8 code; //!< From enum RoseInstructionCode.
 };
 struct ROSE_STRUCT_ANCHORED_DELAY {
@@ -119,18 +142,6 @@ struct ROSE_STRUCT_ANCHORED_DELAY {
     u8 code; //!< From enum RoseInstructionCode.
     u32 done_jump; //!< Jump forward this many bytes if successful.
 };
-union RoseLiteralMask {
-    u64a a64[MAX_MASK2_WIDTH / sizeof(u64a)];
-    u8 a8[MAX_MASK2_WIDTH];
-};
-
-/** Note: check failure will halt program. */
-struct ROSE_STRUCT_CHECK_LIT_MASK {
-    u8 code; //!< From enum RoseInstructionCode.
-    union RoseLiteralMask and_mask;
-    union RoseLiteralMask cmp_mask;
-};
-
 /** Note: check failure will halt program. */
 struct ROSE_STRUCT_CHECK_LIT_EARLY {
     u8 code; //!< From enum RoseInstructionCode.
@@ -161,6 +172,13 @@ struct ROSE_STRUCT_CHECK_NOT_HANDLED {
     u32 fail_jump; //!< Jump forward this many bytes if we have seen key before.
 };
+struct ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND {
+    u8 code; //!< From enum RoseInstructionCode.
+    s8 offset; //!< The offset of the byte to examine.
+    u32 reach_index; //!< The index of the reach table entry to use.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
 struct ROSE_STRUCT_CHECK_LOOKAROUND {
     u8 code; //!< From enum RoseInstructionCode.
     u32 index;
@@ -170,9 +188,18 @@ struct ROSE_STRUCT_CHECK_LOOKAROUND {
 struct ROSE_STRUCT_CHECK_MASK {
     u8 code; //!< From enum roseInstructionCode.
-    u64a and_mask; //!< 64-bits and mask.
-    u64a cmp_mask; //!< 64-bits cmp mask.
+    u64a neg_mask; //!< 8-byte negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MASK_32 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 and_mask[32]; //!< 32-byte and mask.
+    u8 cmp_mask[32]; //!< 32-byte cmp mask.
+    u32 neg_mask; //!< 32-bit negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
@@ -186,6 +213,48 @@ struct ROSE_STRUCT_CHECK_BYTE {
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
+// Since m128 and m256 could be misaligned in the bytecode,
+// we'll use u8[16] and u8[32] instead in all rose_check_shufti structures.
+struct ROSE_STRUCT_CHECK_SHUFTI_16x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 nib_mask[32]; //!< High 16 and low 16 bytes of nibble mask in shufti.
+    u8 bucket_select_mask[16]; //!< Mask for bucket assignment.
+    u32 neg_mask; //!< Negation mask in low 16 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_32x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[16]; //!< High nibble mask in shufti.
+    u8 lo_mask[16]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[32]; //!< Mask for bucket assignment.
+    u32 neg_mask; //!< 32-bit negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_16x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[32]; //!< High nibble mask in shufti.
+    u8 lo_mask[32]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[32]; //!< Mask for bucket assignment.
+    u32 neg_mask; //!< Negation mask in low 16 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_32x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[32]; //!< High nibble mask in shufti.
+    u8 lo_mask[32]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask_hi[32]; //!< Bucket mask for high 8 buckets.
+    u8 bucket_select_mask_lo[32]; //!< Bucket mask for low 8 buckets.
+    u32 neg_mask; //!< 32-bit negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
 struct ROSE_STRUCT_CHECK_INFIX {
     u8 code; //!< From enum RoseInstructionCode.
     u32 queue; //!< Queue of leftfix to check.
@@ -389,6 +458,12 @@ struct ROSE_STRUCT_SPARSE_ITER_NEXT {
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
+struct ROSE_STRUCT_SPARSE_ITER_ANY {
+    u8 code; //!< From enum RoseInstructionCode.
+    u32 iter_offset; //!< Offset of mmbit_sparse_iter structure.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
 struct ROSE_STRUCT_ENGINES_EOD {
     u8 code; //!< From enum RoseInstructionCode.
     u32 iter_offset; //!< Offset of mmbit_sparse_iter structure.
@@ -402,8 +477,18 @@ struct ROSE_STRUCT_MATCHER_EOD {
     u8 code; //!< From enum RoseInstructionCode.
 };
-struct ROSE_STRUCT_END {
+/** Note: check failure will halt program. */
+struct ROSE_STRUCT_CHECK_LONG_LIT {
     u8 code; //!< From enum RoseInstructionCode.
+    u32 lit_offset; //!< Offset of literal string.
+    u32 lit_length; //!< Length of literal string.
+};
+
+/** Note: check failure will halt program.
*/ +struct ROSE_STRUCT_CHECK_LONG_LIT_NOCASE { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. }; #endif // ROSE_ROSE_PROGRAM_H diff --git a/src/rose/runtime.h b/src/rose/runtime.h index 60c7d34b..d2a4b5d7 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -97,8 +97,8 @@ void storeGroups(const struct RoseEngine *t, char *state, rose_group groups) { } static really_inline -u8 *getFloatingMatcherState(const struct RoseEngine *t, char *state) { - return (u8 *)(state + t->stateOffsets.floatingMatcherState); +u8 *getLongLitState(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.longLitState); } static really_inline diff --git a/src/rose/stream.c b/src/rose/stream.c index b934f98f..9599612f 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -33,6 +33,8 @@ #include "miracle.h" #include "program_runtime.h" #include "rose.h" +#include "rose_internal.h" +#include "stream_long_lit.h" #include "hwlm/hwlm.h" #include "nfa/mcclellan.h" #include "nfa/nfa_api.h" @@ -406,6 +408,7 @@ void ensureStreamNeatAndTidy(const struct RoseEngine *t, char *state, roseFlushLastByteHistory(t, scratch, offset + length); tctxt->lastEndOffset = offset + length; storeGroups(t, state, tctxt->groups); + storeLongLiteralState(t, state, scratch); } static really_inline @@ -548,6 +551,7 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { tctxt->minMatchOffset = offset; tctxt->minNonMpvMatchOffset = offset; tctxt->next_mpv_offset = 0; + DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n", scratch->core_info.hlen, scratch->core_info.len, tctxt->groups); @@ -576,6 +580,12 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { const struct HWLM *ftable = getFLiteralMatcher(t); if (ftable) { + // Load in long literal table state and set up "fake history" buffers + // (ll_buf, etc, used by the CHECK_LONG_LIT instruction). Note that this + // must be done here in order to ensure that it happens before any path + // that leads to storeLongLiteralState(), which relies on these buffers. + loadLongLiteralState(t, state, scratch); + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); goto flush_delay_and_exit; @@ -621,17 +631,9 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { } DEBUG_PRINTF("start=%zu\n", start); - u8 *stream_state; - if (t->floatingStreamState) { - stream_state = getFloatingMatcherState(t, state); - } else { - stream_state = NULL; - } - DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); hwlmExecStreaming(ftable, scratch, flen, start, roseFloatingCallback, - scratch, tctxt->groups & t->floating_group_mask, - stream_state); + scratch, tctxt->groups & t->floating_group_mask); } flush_delay_and_exit: diff --git a/src/rose/stream_long_lit.h b/src/rose/stream_long_lit.h new file mode 100644 index 00000000..d78e2863 --- /dev/null +++ b/src/rose/stream_long_lit.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
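
The CHECK_SHUFTI instructions above are executed with vpshufb in validate_shufti.h (further below), but the underlying per-byte predicate is easy to state in scalar terms: each nibble of a byte indexes a 16-entry table of 8-bit bucket sets, and the byte hits a bucket only when both nibble lookups agree. A scalar model of the 8-bucket case (the neg_mask and valid-data handling of the real instructions is omitted here):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of an 8-bucket shufti lookup: hi_mask/lo_mask are
    // 16-entry nibble tables; bucket_select picks the buckets a position
    // must hit.
    static bool byteInBuckets(uint8_t c, const uint8_t hi_mask[16],
                              const uint8_t lo_mask[16],
                              uint8_t bucket_select) {
        uint8_t lo = lo_mask[c & 0xf]; // buckets allowed by the low nibble
        uint8_t hi = hi_mask[c >> 4];  // buckets allowed by the high nibble
        return (lo & hi & bucket_select) != 0; // both must agree
    }

    int main() {
        // Tables where bucket 0 (bit 0) matches exactly the byte 'a' (0x61).
        uint8_t hi_mask[16] = {0}, lo_mask[16] = {0};
        hi_mask[0x6] = 0x01;
        lo_mask[0x1] = 0x01;

        printf("'a' -> %d\n", byteInBuckets('a', hi_mask, lo_mask, 0x01)); // 1
        printf("'b' -> %d\n", byteInBuckets('b', hi_mask, lo_mask, 0x01)); // 0
        return 0;
    }
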
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ *   may be used to endorse or promote products derived from this software
+ *   without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef STREAM_LONG_LIT_H
+#define STREAM_LONG_LIT_H
+
+#include "rose.h"
+#include "rose_common.h"
+#include "rose_internal.h"
+#include "stream_long_lit_hash.h"
+#include "util/copybytes.h"
+
+static really_inline
+const struct RoseLongLitHashEntry *
+getHashTableBase(const struct RoseLongLitTable *ll_table,
+                 const struct RoseLongLitSubtable *ll_sub) {
+    assert(ll_sub->hashOffset);
+    return (const struct RoseLongLitHashEntry *)((const char *)ll_table +
+                                                 ll_sub->hashOffset);
+}
+
+// Reads from stream state and unpacks values into stream state table.
+static really_inline
+void loadLongLitStreamState(const struct RoseLongLitTable *ll_table,
+                            const u8 *ll_state, u32 *state_case,
+                            u32 *state_nocase) {
+    assert(ll_table);
+    assert(ll_state);
+    assert(state_case && state_nocase);
+
+    u8 ss_bytes = ll_table->streamStateBytes;
+    u8 ssb = ll_table->caseful.streamStateBits;
+    UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits;
+    assert(ss_bytes == (ssb + ssb_nc + 7) / 8);
+
+#if defined(ARCH_32_BIT)
+    // On 32-bit hosts, we may be able to avoid having to do any u64a
+    // manipulation at all.
+    if (ss_bytes <= 4) {
+        u32 ssb_mask = (1U << ssb) - 1;
+        u32 streamVal = partial_load_u32(ll_state, ss_bytes);
+        *state_case = (u32)(streamVal & ssb_mask);
+        *state_nocase = (u32)(streamVal >> ssb);
+        return;
+    }
+#endif
+
+    u64a ssb_mask = (1ULL << ssb) - 1;
+    u64a streamVal = partial_load_u64a(ll_state, ss_bytes);
+    *state_case = (u32)(streamVal & ssb_mask);
+    *state_nocase = (u32)(streamVal >> ssb);
+}
+
+static rose_inline
+void loadLongLiteralStateMode(struct hs_scratch *scratch,
+                              const struct RoseLongLitTable *ll_table,
+                              const struct RoseLongLitSubtable *ll_sub,
+                              const u32 state, const char nocase) {
+    if (!state) {
+        DEBUG_PRINTF("no state for %s\n", nocase ? "caseless" : "caseful");
+        return;
+    }
+
+    const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub);
+    const struct RoseLongLitHashEntry *ent = tab + state - 1;
+
+    assert(ent->str_offset + ent->str_len <= ll_table->size);
+    const u8 *found_buf = (const u8 *)ll_table + ent->str_offset;
+    size_t found_sz = ent->str_len;
+
+    struct RoseContext *tctxt = &scratch->tctxt;
+    if (nocase) {
+        tctxt->ll_buf_nocase = found_buf;
+        tctxt->ll_len_nocase = found_sz;
+    } else {
+        tctxt->ll_buf = found_buf;
+        tctxt->ll_len = found_sz;
+    }
+}
+
+static rose_inline
+void loadLongLiteralState(const struct RoseEngine *t, char *state,
+                          struct hs_scratch *scratch) {
+    if (!t->longLitTableOffset) {
+        return;
+    }
+
+    // If we don't have any long literals in play, these values must point to
+    // the real history buffer so that CHECK_LONG_LIT instructions examine the
+    // history buffer.
+    scratch->tctxt.ll_buf = scratch->core_info.hbuf;
+    scratch->tctxt.ll_len = scratch->core_info.hlen;
+    scratch->tctxt.ll_buf_nocase = scratch->core_info.hbuf;
+    scratch->tctxt.ll_len_nocase = scratch->core_info.hlen;
+
+    if (!scratch->core_info.hlen) {
+        return;
+    }
+
+    const struct RoseLongLitTable *ll_table =
+        getByOffset(t, t->longLitTableOffset);
+    const u8 *ll_state = getLongLitState(t, state);
+
+    u32 state_case;
+    u32 state_nocase;
+    loadLongLitStreamState(ll_table, ll_state, &state_case, &state_nocase);
+
+    DEBUG_PRINTF("loaded {%u, %u}\n", state_case, state_nocase);
+
+    loadLongLiteralStateMode(scratch, ll_table, &ll_table->caseful,
+                             state_case, 0);
+    loadLongLiteralStateMode(scratch, ll_table, &ll_table->nocase,
+                             state_nocase, 1);
+}
+
+static rose_inline
+char confirmLongLiteral(const struct RoseLongLitTable *ll_table,
+                        const struct hs_scratch *scratch,
+                        const struct RoseLongLitHashEntry *ent,
+                        const char nocase) {
+    assert(ent->str_offset + ent->str_len <= ll_table->size);
+    const u8 *s = (const u8 *)ll_table + ent->str_offset;
+    size_t len = ent->str_len;
+    const u8 *buf = scratch->core_info.buf;
+    const size_t buf_len = scratch->core_info.len;
+
+    if (len > buf_len) {
+        const struct RoseContext *tctxt = &scratch->tctxt;
+        const u8 *hist = nocase ? tctxt->ll_buf_nocase : tctxt->ll_buf;
+        size_t hist_len = nocase ? tctxt->ll_len_nocase : tctxt->ll_len;
+
+        if (len > buf_len + hist_len) {
+            return 0; // Break out - not enough total history
+        }
+
+        size_t overhang = len - buf_len;
+        assert(overhang <= hist_len);
+
+        if (cmpForward(hist + hist_len - overhang, s, overhang, nocase)) {
+            return 0;
+        }
+        s += overhang;
+        len -= overhang;
+    }
+
+    // if we got here, we don't need history or we compared ok out of history
+    assert(len <= buf_len);
+
+    if (cmpForward(buf + buf_len - len, s, len, nocase)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+static rose_inline
+const u8 *prepScanBuffer(const struct core_info *ci,
+                         const struct RoseLongLitTable *ll_table, u8 *tempbuf) {
+    const u8 hash_len = ll_table->maxLen;
+    assert(hash_len >= LONG_LIT_HASH_LEN);
+
+    // Our hash function operates over LONG_LIT_HASH_LEN bytes, starting from
+    // location (end of buffer - hash_len). If this block can be satisfied
+    // entirely from either the current buffer or the history buffer, we pass
+    // in the pointer directly; otherwise we must make a copy.
+
+    const u8 *base;
+
+    if (hash_len > ci->len) {
+        size_t overhang = hash_len - ci->len;
+        if (overhang >= LONG_LIT_HASH_LEN) {
+            // Can read enough to hash from inside the history buffer.
+ assert(overhang <= ci->hlen); + base = ci->hbuf + ci->hlen - overhang; + } else { + // Copy: first chunk from history buffer. + assert(overhang <= ci->hlen); + copy_upto_32_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + overhang); + // Copy: second chunk from current buffer. + size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; + assert(copy_buf_len <= ci->len); + copy_upto_32_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + // Read from our temporary buffer for the hash. + base = tempbuf; + } + } else { + // Can read enough to hash from inside the current buffer. + base = ci->buf + ci->len - hash_len; + } + + return base; +} + +#ifndef NDEBUG +// Defensive checking (used in assert) that these table values don't overflow +// the range available. +static really_inline +char streamingTableOverflow(u32 state_case, u32 state_nocase, u8 ssb, + u8 ssb_nc) { + u32 ssb_mask = (1ULL << (ssb)) - 1; + if (state_case & ~ssb_mask) { + return 1; + } + u32 ssb_nc_mask = (1ULL << (ssb_nc)) - 1; + if (state_nocase & ~ssb_nc_mask) { + return 1; + } + return 0; +} +#endif + +// Reads from stream state table and packs values into stream state. +static rose_inline +void storeLongLitStreamState(const struct RoseLongLitTable *ll_table, + u8 *ll_state, u32 state_case, u32 state_nocase) { + assert(ll_table); + assert(ll_state); + + u8 ss_bytes = ll_table->streamStateBytes; + u8 ssb = ll_table->caseful.streamStateBits; + UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits; + assert(ss_bytes == ROUNDUP_N(ssb + ssb_nc, 8) / 8); + assert(!streamingTableOverflow(state_case, state_nocase, ssb, ssb_nc)); + +#if defined(ARCH_32_BIT) + // On 32-bit hosts, we may be able to avoid having to do any u64a + // manipulation at all. + if (ss_bytes <= 4) { + u32 stagingStreamState = state_case; + stagingStreamState |= (state_nocase << ssb); + partial_store_u32(ll_state, stagingStreamState, ss_bytes); + return; + } +#endif + + u64a stagingStreamState = (u64a)state_case; + stagingStreamState |= (u64a)state_nocase << ssb; + partial_store_u64a(ll_state, stagingStreamState, ss_bytes); +} + +static really_inline +char has_bit(const u8 *data, u32 bit) { + return (data[bit / 8] >> (bit % 8)) & 1; +} + +static rose_inline +char bloomHasKey(const u8 *bloom, u32 bloom_mask, u32 hash) { + return has_bit(bloom, hash & bloom_mask); +} + +static rose_inline +char checkBloomFilter(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, + const u8 *scan_buf, char nocase) { + assert(ll_sub->bloomBits); + + const u8 *bloom = (const u8 *)ll_table + ll_sub->bloomOffset; + const u32 bloom_mask = (1U << ll_sub->bloomBits) - 1; + + char v = 1; + v &= bloomHasKey(bloom, bloom_mask, bloomHash_1(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_2(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_3(scan_buf, nocase)); + return v; +} + +/** + * \brief Look for a hit in the hash table. + * + * Returns zero if not found, otherwise returns (bucket + 1). 
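
Both confirmLongLiteral() and prepScanBuffer() above handle the same "overhang" situation: when the region of interest extends len - buf_len bytes past the start of the current buffer, that leading portion is taken from the end of the history buffer instead. The comparison half of that logic in standalone form, with memcmp standing in for cmpForward and the caseful path only:

    #include <cstring>
    #include <cstdio>

    // Sketch: match `lit` (length len) ending exactly at the end of `buf`,
    // borrowing the missing prefix bytes from the tail of `hist`.
    static bool confirmAtEnd(const char *lit, size_t len, const char *hist,
                             size_t hist_len, const char *buf,
                             size_t buf_len) {
        if (len > buf_len) {
            size_t overhang = len - buf_len;
            if (overhang > hist_len) {
                return false; // not enough total data
            }
            // Prefix of the literal vs tail of history.
            if (memcmp(hist + hist_len - overhang, lit, overhang) != 0) {
                return false;
            }
            lit += overhang;
            len -= overhang;
        }
        // Remainder of the literal vs tail of the current buffer.
        return memcmp(buf + buf_len - len, lit, len) == 0;
    }

    int main() {
        const char *hist = "0123456789hello_";
        const char *buf = "world";
        printf("%d\n", confirmAtEnd("hello_world", 11, hist, strlen(hist),
                                    buf, strlen(buf))); // prints 1
        return 0;
    }
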
+ */ +static rose_inline +u32 checkHashTable(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, const u8 *scan_buf, + const struct hs_scratch *scratch, char nocase) { + const u32 nbits = ll_sub->hashBits; + assert(nbits && nbits < 32); + const u32 num_entries = 1U << nbits; + + const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub); + + u32 hash = hashLongLiteral(scan_buf, LONG_LIT_HASH_LEN, nocase); + u32 bucket = hash & ((1U << nbits) - 1); + + while (tab[bucket].str_offset != 0) { + DEBUG_PRINTF("checking bucket %u\n", bucket); + if (confirmLongLiteral(ll_table, scratch, &tab[bucket], nocase)) { + DEBUG_PRINTF("found hit for bucket %u\n", bucket); + return bucket + 1; + } + + if (++bucket == num_entries) { + bucket = 0; + } + } + + return 0; +} + +static rose_inline +void storeLongLiteralState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->longLitTableOffset) { + DEBUG_PRINTF("no table\n"); + return; + } + + struct core_info *ci = &scratch->core_info; + const struct RoseLongLitTable *ll_table = + getByOffset(t, t->longLitTableOffset); + assert(ll_table->maxLen); + + DEBUG_PRINTF("maxLen=%u, len=%zu, hlen=%zu\n", ll_table->maxLen, ci->len, + ci->hlen); + + u32 state_case = 0; + u32 state_nocase = 0; + + // If we don't have enough history, we don't need to do anything. + if (ll_table->maxLen <= ci->len + ci->hlen) { + u8 tempbuf[LONG_LIT_HASH_LEN]; + const u8 *scan_buf = prepScanBuffer(ci, ll_table, tempbuf); + + if (ll_table->caseful.hashBits && + checkBloomFilter(ll_table, &ll_table->caseful, scan_buf, 0)) { + state_case = checkHashTable(ll_table, &ll_table->caseful, scan_buf, + scratch, 0); + } + + if (ll_table->nocase.hashBits && + checkBloomFilter(ll_table, &ll_table->nocase, scan_buf, 1)) { + state_nocase = checkHashTable(ll_table, &ll_table->nocase, scan_buf, + scratch, 1); + } + } else { + DEBUG_PRINTF("not enough history (%zu bytes)\n", ci->len + ci->hlen); + } + + DEBUG_PRINTF("store {%u, %u}\n", state_case, state_nocase); + + u8 *ll_state = getLongLitState(t, state); + storeLongLitStreamState(ll_table, ll_state, state_case, state_nocase); +} + +#endif // STREAM_LONG_LIT_H diff --git a/src/rose/stream_long_lit_hash.h b/src/rose/stream_long_lit_hash.h new file mode 100644 index 00000000..041f05e6 --- /dev/null +++ b/src/rose/stream_long_lit_hash.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
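
checkHashTable() below is a textbook open-addressed hash table probe with linear probing: str_offset == 0 marks an empty bucket, so the probe loop stops at the first empty slot on a miss, and a hit is reported as bucket + 1 so that zero can mean "no state" once the value is packed into stream state. The same loop over a toy table:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct Entry {
        std::string key; // empty string plays the role of str_offset == 0
    };

    // Returns bucket + 1 on a hit, 0 on a miss -- the same encoding used by
    // checkHashTable() so that "no entry" packs into stream state as zero.
    static uint32_t probe(const std::vector<Entry> &tab, uint32_t hash,
                          const std::string &key) {
        uint32_t num_entries = (uint32_t)tab.size();
        uint32_t bucket = hash & (num_entries - 1); // size is a power of two
        while (!tab[bucket].key.empty()) {
            if (tab[bucket].key == key) {
                return bucket + 1;
            }
            if (++bucket == num_entries) {
                bucket = 0; // wrap around
            }
        }
        return 0;
    }

    int main() {
        std::vector<Entry> tab(8);
        tab[3].key = "foo";
        tab[4].key = "bar"; // collided with "foo", shifted by linear probing
        printf("%u %u %u\n", probe(tab, 3, "foo"), probe(tab, 3, "bar"),
               probe(tab, 3, "baz")); // prints 4 5 0
        return 0;
    }
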
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef STREAM_LONG_LIT_HASH_H
+#define STREAM_LONG_LIT_HASH_H
+
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/unaligned.h"
+
+/** \brief Length of the buffer operated on by \ref hashLongLiteral(). */
+#define LONG_LIT_HASH_LEN 24
+
+/** \brief Multiplier used by all the hash functions below. */
+#define HASH_MULTIPLIER 0x0b4e0ef37bc32127ULL
+
+/** \brief Hash function used for the long literal table in streaming mode. */
+static really_inline
+u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
+    // We unconditionally hash LONG_LIT_HASH_LEN bytes; all use cases of this
+    // hash are for strings longer than this.
+    assert(len >= LONG_LIT_HASH_LEN);
+
+    u64a v1 = unaligned_load_u64a(ptr);
+    u64a v2 = unaligned_load_u64a(ptr + 8);
+    u64a v3 = unaligned_load_u64a(ptr + 16);
+    if (nocase) {
+        v1 &= OCTO_CASE_CLEAR;
+        v2 &= OCTO_CASE_CLEAR;
+        v3 &= OCTO_CASE_CLEAR;
+    }
+    v1 *= HASH_MULTIPLIER;
+    v2 *= HASH_MULTIPLIER * HASH_MULTIPLIER;
+    v3 *= HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
+    v1 >>= 32;
+    v2 >>= 32;
+    v3 >>= 32;
+    return v1 ^ v2 ^ v3;
+}
+
+/**
+ * \brief Internal, used by the bloom filter hash functions below. Hashes 8
+ * bytes beginning at (ptr + offset).
+ */
+static really_inline
+u32 bloomHash_i(const u8 *ptr, u32 offset, u64a multiplier, char nocase) {
+    assert(offset + 8 <= LONG_LIT_HASH_LEN);
+
+    u64a v = unaligned_load_u64a(ptr + offset);
+    if (nocase) {
+        v &= OCTO_CASE_CLEAR;
+    }
+    v *= multiplier;
+    return v >> 32;
+}
+
+/*
+ * We ensure that we see every byte of the first LONG_LIT_HASH_LEN bytes of
+ * input data (using at least one of the following functions).
+ */
+
+static really_inline
+u32 bloomHash_1(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 0, multiplier, nocase);
+}
+
+static really_inline
+u32 bloomHash_2(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 4, multiplier, nocase);
+}
+
+static really_inline
+u32 bloomHash_3(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 8, multiplier, nocase);
+}
+
+#endif // STREAM_LONG_LIT_HASH_H
diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h
index b2c2f5d6..ac8cc312 100644
--- a/src/rose/validate_mask.h
+++ b/src/rose/validate_mask.h
@@ -26,7 +26,22 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
+#ifndef VALIDATE_MASK_H
+#define VALIDATE_MASK_H
+
 #include "ue2common.h"
+#include "util/simd_utils.h"
+
+#if defined(DEBUG)
+static
+void validateMask32Print(const u8 *mask) {
+    int i;
+    for (i = 0; i < 32; i++) {
+        printf("%02x", mask[i]);
+    }
+    printf("\n");
+}
+#endif
 // check positive bytes in cmp_result.
 // return one if the check passed, zero otherwise.
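
All of the hashes in the header above are multiplicative: load some (case-normalised) bytes, multiply by a large odd 64-bit constant, and keep bits 32..63 of the product. The three bloom filter hashes are derived simply by using successive powers of HASH_MULTIPLIER over overlapping 8-byte windows at offsets 0, 4 and 8, and filter membership requires all three bits to be set. A condensed model, with case normalisation omitted:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static const uint64_t MULT = 0x0b4e0ef37bc32127ULL;

    // Multiplicative hash of 8 bytes: multiply, keep bits 32..63.
    static uint32_t hash8(const uint8_t *p, uint64_t multiplier) {
        uint64_t v;
        memcpy(&v, p, 8);
        return (uint32_t)((v * multiplier) >> 32);
    }

    // k = 3 bloom filter over a bit array of (mask + 1) bits, using powers
    // of the multiplier over windows at offsets 0, 4 and 8 as above.
    static void bloomSet(uint8_t *bloom, uint32_t mask, const uint8_t *p) {
        uint64_t m = MULT;
        for (int i = 0; i < 3; i++, m *= MULT) {
            uint32_t h = hash8(p + 4 * i, m) & mask;
            bloom[h / 8] |= 1 << (h % 8);
        }
    }

    static bool bloomHas(const uint8_t *bloom, uint32_t mask,
                         const uint8_t *p) {
        uint64_t m = MULT;
        for (int i = 0; i < 3; i++, m *= MULT) {
            uint32_t h = hash8(p + 4 * i, m) & mask;
            if (!(bloom[h / 8] & (1 << (h % 8)))) {
                return false; // definitely not present
            }
        }
        return true; // possibly present
    }

    int main() {
        uint8_t bloom[32] = {0}; // 256-bit filter
        const uint8_t *key = (const uint8_t *)"abcdefghijklmnop";
        bloomSet(bloom, 255, key);
        // Prints 1, then (almost certainly) 0.
        printf("%d %d\n", bloomHas(bloom, 255, key),
               bloomHas(bloom, 255, (const uint8_t *)"ABCDEFGHIJKLMNOP"));
        return 0;
    }
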
@@ -75,3 +90,29 @@ int validateMask(u64a data, u64a valid_data_mask, u64a and_mask, return 0; } } + +static really_inline +int validateMask32(const m256 data, const u32 valid_data_mask, + const m256 and_mask, const m256 cmp_mask, + const u32 neg_mask) { + m256 cmp_result_256 = eq256(and256(data, and_mask), cmp_mask); + u32 cmp_result = ~movemask256(cmp_result_256); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask32Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask32Print((const u8 *)&cmp_result_256); +#endif + DEBUG_PRINTF("cmp_result %08x neg_mask %08x\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %08x\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult32 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult32 failed\n"); + return 0; + } +} + +#endif diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h new file mode 100644 index 00000000..49d2c2fe --- /dev/null +++ b/src/rose/validate_shufti.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
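
validateMask32() above widens the 8-byte validateMask() to 32 bytes, but the predicate is the same: mask the data with and_mask, compare with cmp_mask, collapse the per-byte comparison into a bitmask, and require that bitmask to equal neg_mask on the valid bytes; neg_mask is what lets an instruction insist that particular bytes do not match. A scalar equivalent at width 8:

    #include <cstdint>
    #include <cstdio>

    // Scalar model: bit i of `mismatch` is set when byte i of
    // (data & and_mask) differs from cmp_mask. The check passes when that
    // bitmask equals neg_mask over the valid bytes.
    static bool validateMask8(const uint8_t data[8],
                              const uint8_t and_mask[8],
                              const uint8_t cmp_mask[8], uint8_t neg_mask,
                              uint8_t valid_data_mask) {
        uint8_t mismatch = 0;
        for (int i = 0; i < 8; i++) {
            if ((uint8_t)(data[i] & and_mask[i]) != cmp_mask[i]) {
                mismatch |= 1 << i;
            }
        }
        return (mismatch & valid_data_mask) == (neg_mask & valid_data_mask);
    }

    int main() {
        // Require: byte 0 == 'a' case-insensitively (and-mask 0xdf), byte 1
        // != 'b' (via neg_mask), other bytes unconstrained (and-mask 0).
        uint8_t and_mask[8] = {0xdf, 0xff, 0, 0, 0, 0, 0, 0};
        uint8_t cmp_mask[8] = {'A', 'b', 0, 0, 0, 0, 0, 0};
        const uint8_t *s = (const uint8_t *)"Axzzzzzz";
        printf("%d\n", validateMask8(s, and_mask, cmp_mask, /*neg_mask=*/0x02,
                                     /*valid=*/0xff)); // prints 1
        return 0;
    }
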
+ */ + +#ifndef VALIDATE_SHUFTI_H +#define VALIDATE_SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#if defined(DEBUG) +static +void dumpMask(const void *mask, int len) { + const u8 *c = (const u8 *)mask; + for (int i = 0; i < len; i++) { + printf("%02x", c[i]); + } + printf("\n"); +} +#endif + +static really_inline +int validateShuftiMask16x16(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u16 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = vpshufb(lo_mask, and256(data, low4bits)); + m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("and_mask\n"); + dumpMask(&and_mask, 32); + DEBUG_PRINTF("nresult %x\n", nresult); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (((nresult >> 16) & nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask16x8(const m128 data, const m256 nib_mask, + const m128 and_mask, const u32 neg_mask, + const u16 valid_data_mask) { + m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = vpshufb(nib_mask, and256(data_m256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 nresult = eq128(and128(t, and_mask), zeroes128()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data_m256, 32); + DEBUG_PRINTF("nib_mask\n"); + dumpMask(&nib_mask, 32); + DEBUG_PRINTF("c_nib\n"); + dumpMask(&c_nib, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 16); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask128(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x8(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = vpshufb(lo_mask, and256(data, low4bits)); + m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + m256 nresult = eq256(and256(t, and_mask), zeroes256()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask256(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_mask_hi, + const m256 bucket_mask_lo, const u32 neg_mask, + const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = vpshufb(lo_mask_1, data_lo); + m256 c_lo_2 = vpshufb(lo_mask_2, data_lo); + m256 
c_hi_1 = vpshufb(hi_mask_1, data_hi); + m256 c_hi_2 = vpshufb(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 32); + DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 32); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 16); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 16); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 16); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 16); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 32); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 32); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 32); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 32); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif diff --git a/src/runtime.c b/src/runtime.c index e761acc2..88e866dc 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -55,7 +55,6 @@ #include "state.h" #include "ue2common.h" #include "util/exhaust.h" -#include "util/fatbit.h" #include "util/multibit.h" static really_inline @@ -291,12 +290,12 @@ void runSmallWriteEngine(const struct SmallWriteEngine *smwr, if (nfa->type == MCCLELLAN_NFA_8) { nfaExecMcClellan8_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); - } else if (nfa->type == MCCLELLAN_NFA_16){ + } else if (nfa->type == MCCLELLAN_NFA_16) { nfaExecMcClellan16_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); } else { - nfaExecSheng0_B(nfa, smwr->start_offset, local_buffer, - local_alen, roseReportAdaptor, scratch); + nfaExecSheng_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); } } @@ -736,20 +735,11 @@ void pureLiteralStreamExec(struct hs_stream *stream_state, assert(scratch); assert(!can_stop_matching(scratch)); - char *state = getMultiState(stream_state); - const struct RoseEngine *rose = stream_state->rose; const struct HWLM *ftable = getFLiteralMatcher(rose); size_t len2 = scratch->core_info.len; - u8 *hwlm_stream_state; - if (rose->floatingStreamState) { - hwlm_stream_state = getFloatingMatcherState(rose, state); - } else { - hwlm_stream_state = NULL; - } - DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); @@ -761,8 +751,8 @@ void pureLiteralStreamExec(struct hs_stream *stream_state, // start the match region at zero. const size_t start = 0; - hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, - scratch, rose->initialGroups, hwlm_stream_state); + hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, scratch, + rose->initialGroups); if (!told_to_stop_matching(scratch) && isAllExhausted(rose, scratch->core_info.exhaustionVector)) { diff --git a/src/scratch.c b/src/scratch.c index dae2c672..8cbe9760 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -43,17 +43,19 @@ #include "nfa/nfa_api_queue.h" #include "rose/rose_internal.h" #include "util/fatbit.h" -#include "util/multibit.h" /** * Determine the space required for a correctly aligned array of fatbit * structure, laid out as: * * - an array of num_entries pointers, each to a fatbit. 
- * - an array of fatbit structures, each of size fatbit_size(num_keys). + * - an array of fatbit structures, each of size fatbit_len. + * + * fatbit_len should have been determined at compile time, via the + * fatbit_size() call. */ static -size_t fatbit_array_size(u32 num_entries, u32 num_keys) { +size_t fatbit_array_size(u32 num_entries, u32 fatbit_len) { size_t len = 0; // Array of pointers to each fatbit entry. @@ -61,7 +63,7 @@ size_t fatbit_array_size(u32 num_entries, u32 num_keys) { // Fatbit entries themselves. len = ROUNDUP_N(len, alignof(struct fatbit)); - len += (size_t)fatbit_size(num_keys) * num_entries; + len += (size_t)fatbit_len * num_entries; return ROUNDUP_N(len, 8); // Round up for potential padding. } @@ -71,17 +73,19 @@ size_t fatbit_array_size(u32 num_entries, u32 num_keys) { static hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { u32 queueCount = proto->queueCount; - u32 deduperCount = proto->deduper.log_size; + u32 activeQueueArraySize = proto->activeQueueArraySize; + u32 deduperCount = proto->deduper.dkey_count; + u32 deduperLogSize = proto->deduper.log_size; u32 bStateSize = proto->bStateSize; u32 tStateSize = proto->tStateSize; u32 fullStateSize = proto->fullStateSize; u32 anchored_literal_region_len = proto->anchored_literal_region_len; - u32 anchored_literal_region_width = proto->anchored_literal_count; + u32 anchored_literal_fatbit_size = proto->anchored_literal_fatbit_size; u32 som_store_size = proto->som_store_count * sizeof(u64a); u32 som_attempted_store_size = proto->som_store_count * sizeof(u64a); - u32 som_now_size = fatbit_size(proto->som_store_count); - u32 som_attempted_size = fatbit_size(proto->som_store_count); + u32 som_now_size = proto->som_fatbit_size; + u32 som_attempted_size = proto->som_fatbit_size; struct hs_scratch *s; struct hs_scratch *s_tmp; @@ -91,18 +95,18 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { assert(anchored_literal_region_len < 8 * sizeof(s->al_log_sum)); size_t anchored_literal_region_size = fatbit_array_size( - anchored_literal_region_len, anchored_literal_region_width); + anchored_literal_region_len, proto->anchored_literal_fatbit_size); size_t delay_region_size = - fatbit_array_size(DELAY_SLOT_COUNT, proto->delay_count); + fatbit_array_size(DELAY_SLOT_COUNT, proto->delay_fatbit_size); // the size is all the allocated stuff, not including the struct itself size_t size = queue_size + 63 + bStateSize + tStateSize + fullStateSize + 63 /* cacheline padding */ - + fatbit_size(proto->handledKeyCount) /* handled roles */ - + fatbit_size(queueCount) /* active queue array */ - + 2 * fatbit_size(deduperCount) /* need odd and even logs */ - + 2 * fatbit_size(deduperCount) /* ditto som logs */ + + proto->handledKeyFatbitSize /* handled roles */ + + activeQueueArraySize /* active queue array */ + + 2 * deduperLogSize /* need odd and even logs */ + + 2 * deduperLogSize /* ditto som logs */ + 2 * sizeof(u64a) * deduperCount /* start offsets for som */ + anchored_literal_region_size + qmpq_size + delay_region_size @@ -157,7 +161,7 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) { s->delay_slots[i] = (struct fatbit *)current; assert(ISALIGNED(s->delay_slots[i])); - current += fatbit_size(proto->delay_count); + current += proto->delay_fatbit_size; } current = ROUNDUP_PTR(current, alignof(struct fatbit *)); @@ -167,7 +171,7 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { 
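
fatbit_array_size() above describes a common two-part layout: an array of num_entries pointers, then num_entries equally-sized payloads, with alignment rounding between the sections and at the end; the carving loops in alloc_scratch() then walk exactly the same layout. A standalone sketch of the pattern with plain types:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Round x up to a multiple of n (n a power of two), as ROUNDUP_N does.
    static size_t roundup(size_t x, size_t n) { return (x + n - 1) & ~(n - 1); }

    int main() {
        const size_t num_entries = 4, payload_len = 24;

        // Size the block: pointer array, then the payloads.
        size_t len = num_entries * sizeof(char *);
        len = roundup(len, alignof(uint64_t)); // payload alignment
        len += payload_len * num_entries;
        len = roundup(len, 8); // trailing padding

        // Carve it up the same way.
        char *block = (char *)calloc(1, len);
        char **ptrs = (char **)block;
        char *payload = block + roundup(num_entries * sizeof(char *),
                                        alignof(uint64_t));
        for (size_t i = 0; i < num_entries; i++) {
            ptrs[i] = payload + i * payload_len; // one payload per entry
        }

        printf("total %zu bytes, entry 2 at offset %zu\n", len,
               (size_t)(ptrs[2] - block));
        free(block);
        return 0;
    }
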
for (u32 i = 0; i < anchored_literal_region_len; i++) { s->al_log[i] = (struct fatbit *)current; assert(ISALIGNED(s->al_log[i])); - current += fatbit_size(anchored_literal_region_width); + current += anchored_literal_fatbit_size; } current = ROUNDUP_PTR(current, 8); @@ -193,22 +197,22 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { assert(ISALIGNED_N(current, 8)); s->aqa = (struct fatbit *)current; - current += fatbit_size(queueCount); + current += activeQueueArraySize; s->handled_roles = (struct fatbit *)current; - current += fatbit_size(proto->handledKeyCount); + current += proto->handledKeyFatbitSize; s->deduper.log[0] = (struct fatbit *)current; - current += fatbit_size(deduperCount); + current += deduperLogSize; s->deduper.log[1] = (struct fatbit *)current; - current += fatbit_size(deduperCount); + current += deduperLogSize; s->deduper.som_log[0] = (struct fatbit *)current; - current += fatbit_size(deduperCount); + current += deduperLogSize; s->deduper.som_log[1] = (struct fatbit *)current; - current += fatbit_size(deduperCount); + current += deduperLogSize; s->som_set_now = (struct fatbit *)current; current += som_now_size; @@ -293,19 +297,19 @@ hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch) { proto->anchored_literal_region_len = rose->anchoredDistance; } - if (rose->anchored_count > proto->anchored_literal_count) { + if (rose->anchored_fatbit_size > proto->anchored_literal_fatbit_size) { resize = 1; - proto->anchored_literal_count = rose->anchored_count; + proto->anchored_literal_fatbit_size = rose->anchored_fatbit_size; } - if (rose->delay_count > proto->delay_count) { + if (rose->delay_fatbit_size > proto->delay_fatbit_size) { resize = 1; - proto->delay_count = rose->delay_count; + proto->delay_fatbit_size = rose->delay_fatbit_size; } - if (rose->handledKeyCount > proto->handledKeyCount) { + if (rose->handledKeyFatbitSize > proto->handledKeyFatbitSize) { resize = 1; - proto->handledKeyCount = rose->handledKeyCount; + proto->handledKeyFatbitSize = rose->handledKeyFatbitSize; } if (rose->tStateSize > proto->tStateSize) { @@ -319,12 +323,22 @@ hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch) { proto->som_store_count = som_store_count; } + if (rose->somLocationFatbitSize > proto->som_fatbit_size) { + resize = 1; + proto->som_fatbit_size = rose->somLocationFatbitSize; + } + u32 queueCount = rose->queueCount; if (queueCount > proto->queueCount) { resize = 1; proto->queueCount = queueCount; } + if (rose->activeQueueArraySize > proto->activeQueueArraySize) { + resize = 1; + proto->activeQueueArraySize = rose->activeQueueArraySize; + } + u32 bStateSize = 0; if (rose->mode == HS_MODE_BLOCK) { bStateSize = rose->stateOffsets.end; @@ -344,9 +358,10 @@ hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch) { proto->fullStateSize = fullStateSize; } - if (rose->dkeyCount > proto->deduper.log_size) { + if (rose->dkeyCount > proto->deduper.dkey_count) { resize = 1; - proto->deduper.log_size = rose->dkeyCount; + proto->deduper.dkey_count = rose->dkeyCount; + proto->deduper.log_size = rose->dkeyLogSize; } if (resize) { diff --git a/src/scratch.h b/src/scratch.h index a2f02503..b59dc8d4 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -45,7 +45,7 @@ extern "C" #endif UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; -#define FDR_TEMP_BUF_SIZE 220 +#define FDR_TEMP_BUF_SIZE 222 struct fatbit; struct hs_scratch; @@ -122,12 +122,33 @@ struct RoseContext { u32 filledDelayedSlots; u32 
diff --git a/src/scratch.h b/src/scratch.h
index a2f02503..b59dc8d4 100644
--- a/src/scratch.h
+++ b/src/scratch.h
@@ -45,7 +45,7 @@ extern "C"
 #endif
 
 UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259;
-#define FDR_TEMP_BUF_SIZE 220
+#define FDR_TEMP_BUF_SIZE 222
 
 struct fatbit;
 struct hs_scratch;
@@ -122,12 +122,33 @@ struct RoseContext {
     u32 filledDelayedSlots;
     u32 curr_qi;    /**< currently executing main queue index during
                      * \ref nfaQueueExec */
+
+    /**
+     * \brief Buffer for caseful long literal support, used in streaming mode
+     * only.
+     *
+     * If a long literal prefix was at the end of the buffer at the end of a
+     * stream write, then the long lit table hashes it and stores the result in
+     * stream state. At the start of the next write, this value is used to set
+     * this buffer to the matching prefix string (stored in the bytecode).
+     */
+    const u8 *ll_buf;
+
+    /** \brief Length in bytes of the string pointed to by ll_buf. */
+    size_t ll_len;
+
+    /** \brief Caseless version of ll_buf. */
+    const u8 *ll_buf_nocase;
+
+    /** \brief Length in bytes of the string pointed to by ll_buf_nocase. */
+    size_t ll_len_nocase;
 };
 
 struct match_deduper {
     struct fatbit *log[2]; /**< even, odd logs */
     struct fatbit *som_log[2]; /**< even, odd fatbit logs for som */
     u64a *som_start_log[2]; /**< even, odd start offset logs for som */
+    u32 dkey_count;
     u32 log_size;
     u64a current_report_offset;
     u8 som_log_dirty;
@@ -142,6 +163,7 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     u32 magic;
     u8 in_use; /**< non-zero when being used by an API call. */
     u32 queueCount;
+    u32 activeQueueArraySize; /**< size of active queue array fatbit in bytes */
     u32 bStateSize; /**< sizeof block mode states */
     u32 tStateSize; /**< sizeof transient rose states */
     u32 fullStateSize; /**< size of uncompressed nfa state */
@@ -159,7 +181,7 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     struct core_info core_info;
     struct match_deduper deduper;
     u32 anchored_literal_region_len;
-    u32 anchored_literal_count;
+    u32 anchored_literal_fatbit_size; /**< size of each anch fatbit in bytes */
     struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already
                                    * handled by this literal */
     u64a *som_store; /**< array of som locations */
@@ -171,8 +193,9 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
                          * location had been writable */
     u64a som_set_now_offset; /**< offset at which som_set_now represents */
     u32 som_store_count;
-    u32 handledKeyCount;
-    u32 delay_count;
+    u32 som_fatbit_size; /**< size of som location fatbit structures in bytes */
+    u32 handledKeyFatbitSize; /**< size of handled_keys fatbit in bytes */
+    u32 delay_fatbit_size; /**< size of each delay fatbit in bytes */
     u32 scratchSize;
     char *scratch_alloc; /* user allocated scratch object */
     u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE];
diff --git a/src/scratch_dump.cpp b/src/scratch_dump.cpp
index 78a854bb..47c93c37 100644
--- a/src/scratch_dump.cpp
+++ b/src/scratch_dump.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -32,7 +32,7 @@
 #include "scratch_dump.h"
 #include "hs_internal.h"
 #include "ue2common.h"
-#include "util/multibit_internal.h"
+#include "util/multibit_build.h"
 
 #include "nfa/nfa_api_queue.h"
 #include "rose/rose_internal.h"
@@ -54,12 +54,11 @@ void dumpScratch(const struct hs_scratch *s, FILE *f) {
     fprintf(f, " queues : %zu bytes\n",
             s->queueCount * sizeof(struct mq));
     fprintf(f, " bStateSize : %u bytes\n", s->bStateSize);
-    fprintf(f, " active queue array : %u bytes\n",
-            mmbit_size(s->queueCount));
+    fprintf(f, " active queue array : %u bytes\n", s->activeQueueArraySize);
     fprintf(f, " qmpq : %zu bytes\n",
             s->queueCount * sizeof(struct queue_match));
     fprintf(f, " delay info : %u bytes\n",
-            mmbit_size(s->delay_count) * DELAY_SLOT_COUNT);
+            s->delay_fatbit_size * DELAY_SLOT_COUNT);
 }
 
 } // namespace ue2
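The hand-off described in the new RoseContext comment can be pictured as follows. This is an illustrative sketch only: long_lit_table, tail_prefix_id() and restore_ll_buf() are invented names, and the real implementation stores a hash of the trailing bytes in stream state rather than rescanning the stored prefixes.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    struct long_lit_table {
        std::vector<std::string> prefixes; // copies stored in the "bytecode"
    };

    // End of a write: record which stored prefix (all assumed non-empty) the
    // buffer ends with, or 0 for none.
    static uint32_t tail_prefix_id(const long_lit_table &tab,
                                   const uint8_t *buf, size_t len) {
        for (uint32_t i = 0; i < tab.prefixes.size(); i++) {
            const std::string &p = tab.prefixes[i];
            if (len >= p.size() &&
                memcmp(buf + len - p.size(), p.data(), p.size()) == 0) {
                return i + 1;
            }
        }
        return 0;
    }

    // Start of the next write: point ll_buf at the bytecode's copy of the
    // prefix, so matching continues as if those bytes were still in scope.
    static void restore_ll_buf(const long_lit_table &tab, uint32_t prefix_id,
                               const uint8_t **ll_buf, size_t *ll_len) {
        if (prefix_id == 0) {
            *ll_buf = nullptr;
            *ll_len = 0;
            return;
        }
        const std::string &p = tab.prefixes[prefix_id - 1];
        *ll_buf = reinterpret_cast<const uint8_t *>(p.data());
        *ll_len = p.size();
    }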
diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp
index 90770ba5..108bca8a 100644
--- a/src/smallwrite/smallwrite_build.cpp
+++ b/src/smallwrite/smallwrite_build.cpp
@@ -30,16 +30,18 @@
 
 #include "grey.h"
 #include "ue2common.h"
+#include "nfa/dfa_min.h"
 #include "nfa/mcclellancompile.h"
 #include "nfa/mcclellancompile_util.h"
 #include "nfa/nfa_internal.h"
 #include "nfa/rdfa_merge.h"
 #include "nfa/shengcompile.h"
 #include "nfagraph/ng.h"
+#include "nfagraph/ng_depth.h"
 #include "nfagraph/ng_holder.h"
 #include "nfagraph/ng_mcclellan.h"
+#include "nfagraph/ng_prune.h"
 #include "nfagraph/ng_util.h"
-#include "nfagraph/ng_width.h"
 #include "smallwrite/smallwrite_internal.h"
 #include "util/alloc.h"
 #include "util/charreach.h"
@@ -101,6 +103,74 @@ SmallWriteBuildImpl::SmallWriteBuildImpl(size_t num_patterns,
                       || num_patterns > cc.grey.smallWriteMaxPatterns) {
 }
 
+/**
+ * \brief Remove any reports from the given vertex that cannot match within
+ * max_depth due to their constraints.
+ */
+static
+bool pruneOverlongReports(NFAVertex v, NGHolder &g, const depth &max_depth,
+                          const ReportManager &rm) {
+    assert(!g[v].reports.empty());
+
+    vector<ReportID> bad_reports;
+
+    for (ReportID id : g[v].reports) {
+        const auto &report = rm.getReport(id);
+        if (report.minOffset > max_depth) {
+            bad_reports.push_back(id);
+        }
+    }
+
+    for (ReportID id : bad_reports) {
+        g[v].reports.erase(id);
+    }
+
+    if (g[v].reports.empty()) {
+        DEBUG_PRINTF("none of vertex %zu's reports can match, cut accepts\n",
+                     g[v].index);
+        remove_edge(v, g.accept, g);
+        remove_edge(v, g.acceptEod, g);
+    }
+
+    return !bad_reports.empty();
+}
+
+/**
+ * \brief Prune vertices and reports from the graph that cannot match within
+ * max_depth.
+ */
+static
+bool pruneOverlong(NGHolder &g, const depth &max_depth,
+                   const ReportManager &rm) {
+    bool modified = false;
+    std::vector<NFAVertexDepth> depths;
+    calcDepths(g, depths);
+
+    for (auto v : vertices_range(g)) {
+        if (is_special(v, g)) {
+            continue;
+        }
+        const auto &d = depths.at(g[v].index);
+        depth min_depth = min(d.fromStart.min, d.fromStartDotStar.min);
+        if (min_depth > max_depth) {
+            clear_vertex(v, g);
+            modified = true;
+            continue;
+        }
+
+        if (is_match_vertex(v, g)) {
+            modified |= pruneOverlongReports(v, g, max_depth, rm);
+        }
+    }
+
+    if (modified) {
+        pruneUseless(g);
+        DEBUG_PRINTF("pruned graph down to %zu vertices\n", num_vertices(g));
+    }
+
+    return modified;
+}
+
 void SmallWriteBuildImpl::add(const NGWrapper &w) {
     // If the graph is poisoned (i.e. we can't build a SmallWrite version),
     // we don't even try.
@@ -118,13 +188,12 @@ void SmallWriteBuildImpl::add(const NGWrapper &w) {
     // make a copy of the graph so that we can modify it for our purposes
     unique_ptr<NGHolder> h = cloneHolder(w);
 
+    pruneOverlong(*h, depth(cc.grey.smallWriteLargestBuffer), rm);
+
     reduceGraph(*h, SOM_NONE, w.utf8, cc);
 
-    // If the earliest match location is outside the small write region,
-    // then we don't need to build a SmallWrite version.
-    // However, we don't poison this case either, since it is simply a case,
-    // where we know the resulting graph won't match.
-    if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) {
+    if (can_never_match(*h)) {
+        DEBUG_PRINTF("graph can never match in small block\n");
         return;
     }
@@ -140,7 +209,9 @@ void SmallWriteBuildImpl::add(const NGWrapper &w) {
         return;
     }
 
-    prune_overlong(*r, cc.grey.smallWriteLargestBuffer);
+    if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) {
+        minimize_hopcroft(*r, cc.grey);
+    }
 
     if (rdfa) {
         // do a merge of the new dfa with the existing dfa
@@ -350,6 +421,7 @@ aligned_unique_ptr<NFA> prepEngine(raw_dfa &rdfa, u32 roseQuality,
             return nullptr;
         }
         if (prune_overlong(rdfa, *small_region - *start_offset)) {
+            minimize_hopcroft(rdfa, cc.grey);
             if (rdfa.start_anchored == DEAD_STATE) {
                 DEBUG_PRINTF("all patterns pruned out\n");
                 return nullptr;
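The pruning pass above has two layers: a vertex whose minimum distance from start already exceeds the small-write buffer can never match and is removed outright, while a reachable vertex may still lose individual reports whose minOffset constraint cannot be satisfied. A minimal sketch of the two predicates, with plain integers standing in for the depth type and the report manager:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct VertexInfo {
        // min(fromStart.min, fromStartDotStar.min) in the real pass.
        uint32_t min_depth_from_start;
        std::vector<uint32_t> report_min_offsets;
    };

    // Layer 1: the whole vertex is dead if it cannot be reached in time.
    static bool vertex_overlong(const VertexInfo &v, uint32_t max_depth) {
        return v.min_depth_from_start > max_depth;
    }

    // Layer 2: otherwise, drop only the reports that cannot fire in time.
    static void prune_reports(VertexInfo &v, uint32_t max_depth) {
        auto &r = v.report_min_offsets;
        r.erase(std::remove_if(r.begin(), r.end(),
                               [&](uint32_t m) { return m > max_depth; }),
                r.end());
    }

If a vertex loses all of its reports, the real pass also cuts its edges to accept and acceptEod, which is what makes the subsequent pruneUseless() sweep effective.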
diff --git a/src/smallwrite/smallwrite_dump.cpp b/src/smallwrite/smallwrite_dump.cpp
index 0db97df5..bdf55c30 100644
--- a/src/smallwrite/smallwrite_dump.cpp
+++ b/src/smallwrite/smallwrite_dump.cpp
@@ -70,18 +70,11 @@ void smwrDumpNFA(const SmallWriteEngine *smwr, bool dump_raw,
     }
 
     const struct NFA *n = getSmwrNfa(smwr);
-    FILE *f;
 
-    f = fopen((base + "smallwrite_nfa.dot").c_str(), "w");
-    nfaDumpDot(n, f, base);
-    fclose(f);
-
-    f = fopen((base + "smallwrite_nfa.txt").c_str(), "w");
-    nfaDumpText(n, f);
-    fclose(f);
+    nfaGenerateDumpFiles(n, base + "smallwrite_nfa");
 
     if (dump_raw) {
-        f = fopen((base + "smallwrite_nfa.raw").c_str(), "w");
+        FILE *f = fopen((base + "smallwrite_nfa.raw").c_str(), "w");
         fwrite(n, 1, n->length, f);
         fclose(f);
     }
diff --git a/src/som/slot_manager.h b/src/som/slot_manager.h
index 9de78f44..971ea362 100644
--- a/src/som/slot_manager.h
+++ b/src/som/slot_manager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,7 +34,7 @@
 #define SLOT_MANAGER_H
 
 #include "ue2common.h"
-#include "nfagraph/ng_graph.h"
+#include "nfagraph/ng_holder.h"
 #include "util/alloc.h"
 #include "util/ue2_containers.h"
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index 6f1bcd09..d144e879 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -70,6 +70,7 @@
 #define CASE_BIT          0x20
 #define CASE_CLEAR        0xdf
 #define DOUBLE_CASE_CLEAR 0xdfdf
+#define OCTO_CASE_CLEAR   0xdfdfdfdfdfdfdfdfULL
 
 static really_inline
 u32 clz32(u32 x) {
@@ -470,4 +471,55 @@ u32 rank_in_mask64(u64a mask, u32 bit) {
     return popcount64(mask);
 }
 
+#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__))
+#define HAVE_PEXT
+#endif
+
+static really_inline
+u32 pext32(u32 x, u32 mask) {
+#if defined(HAVE_PEXT)
+    // Intel BMI2 can do this operation in one instruction.
+    return _pext_u32(x, mask);
+#else
+
+    u32 result = 0, num = 1;
+    while (mask != 0) {
+        u32 bit = findAndClearLSB_32(&mask);
+        if (x & (1U << bit)) {
+            assert(num != 0); // more than 32 bits!
+            result |= num;
+        }
+        num <<= 1;
+    }
+    return result;
+#endif
+}
+
+static really_inline
+u64a pext64(u64a x, u64a mask) {
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+    // Intel BMI2 can do this operation in one instruction.
+    return _pext_u64(x, mask);
+#else
+
+    u64a result = 0, num = 1; // 64-bit: a u32 here would drop selected bits
+    while (mask != 0) {
+        u32 bit = findAndClearLSB_64(&mask);
+        if (x & (1ULL << bit)) {
+            assert(num != 0); // more than 64 bits!
+ result |= num; + } + num <<= 1; + } + return result; +#endif +} + +#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#endif + #endif // BITUTILS_H diff --git a/src/util/clique.cpp b/src/util/clique.cpp index ea22779c..79f06932 100644 --- a/src/util/clique.cpp +++ b/src/util/clique.cpp @@ -103,7 +103,7 @@ bool graph_empty(const Graph &g) { } vector> removeClique(CliqueGraph &cg) { - DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); + DEBUG_PRINTF("graph size:%zu\n", num_vertices(cg)); vector> cliquesVec = {findCliqueGroup(cg)}; while (!graph_empty(cg)) { const vector &c = cliquesVec.back(); diff --git a/src/util/container.h b/src/util/container.h index 63e27743..e2cfb485 100644 --- a/src/util/container.h +++ b/src/util/container.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ #include #include #include +#include namespace ue2 { @@ -78,7 +79,9 @@ void insert(C *container, typename C::iterator pos, const D &donor) { } /** - * \brief Constructs a vector from a range bounded by the given pair of iterators. */ + * \brief Constructs a vector from a range bounded by the given pair of + * iterators. + */ template auto make_vector_from(const std::pair &range) -> std::vector { diff --git a/src/util/copybytes.h b/src/util/copybytes.h new file mode 100644 index 00000000..872b8d28 --- /dev/null +++ b/src/util/copybytes.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef COPY_BYTES_H +#define COPY_BYTES_H + +#include "unaligned.h" +#include "simd_utils.h" + +static really_inline +void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + case 32: + storeu256(dst, loadu256(src)); + break; + default: + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +#endif diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index 9a8bd922..dba147ee 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -40,12 +40,14 @@ #define SSSE3 (1 << 9) #define SSE4_1 (1 << 19) #define SSE4_2 (1 << 20) +#define POPCNT (1 << 23) #define XSAVE (1 << 27) #define AVX (1 << 28) // EDX +#define FXSAVE (1 << 24) #define SSE (1 << 25) -#define SSE2 (1 << 25) +#define SSE2 (1 << 26) #define HTT (1 << 28) // Structured Extended Feature Flags Enumeration Leaf ECX values @@ -87,7 +89,6 @@ u64a xgetbv(u32 op) { #endif } -static int check_avx2(void) { #if defined(__INTEL_COMPILER) return _may_i_use_cpu_feature(_FEATURE_AVX2); @@ -137,6 +138,24 @@ u64a cpuid_flags(void) { return cap; } +int check_ssse3(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & SSSE3); +} + +int check_sse42(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & SSE4_2); +} + +int check_popcnt(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & POPCNT); +} + struct family_id { u32 full_family; u32 full_model; diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h index 2df97ab5..8b23d495 100644 --- a/src/util/cpuid_flags.h +++ b/src/util/cpuid_flags.h @@ -41,6 +41,11 @@ u64a cpuid_flags(void); u32 cpuid_tune(void); +int check_avx2(void); +int check_ssse3(void); +int check_sse42(void); +int check_popcnt(void); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/util/dump_charclass.cpp b/src/util/dump_charclass.cpp index 74b45414..4c159ec2 100644 --- a/src/util/dump_charclass.cpp +++ b/src/util/dump_charclass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -249,6 +249,15 @@ string describeClass(const CharReach &cr, size_t maxLength, return oss.str(); } +string describeClasses(const std::vector &v, size_t maxClassLength, + enum cc_output_t out_type) { + std::ostringstream oss; + for (const auto &cr : v) { + describeClass(oss, cr, maxClassLength, out_type); + } + return oss.str(); +} + // C stdio wrapper void describeClass(FILE *f, const 
CharReach &cr, size_t maxLength, enum cc_output_t out_type) { diff --git a/src/util/dump_charclass.h b/src/util/dump_charclass.h index 9c3362bc..45b707f1 100644 --- a/src/util/dump_charclass.h +++ b/src/util/dump_charclass.h @@ -38,6 +38,7 @@ #include #include #include +#include namespace ue2 { @@ -54,6 +55,10 @@ void describeClass(std::ostream &os, const CharReach &cr, size_t maxLength = 16, std::string describeClass(const CharReach &cr, size_t maxLength = 16, enum cc_output_t out_type = CC_OUT_TEXT); +std::string describeClasses(const std::vector &v, + size_t maxClassLength = 16, + enum cc_output_t out_type = CC_OUT_TEXT); + void describeClass(FILE *f, const CharReach &cr, size_t maxLength, enum cc_output_t out_type); diff --git a/src/util/dump_util.cpp b/src/util/dump_util.cpp new file mode 100644 index 00000000..5b961367 --- /dev/null +++ b/src/util/dump_util.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "dump_util.h" + +#include +#include + +using namespace std; + +FILE *fopen_or_throw(const char *path, const char *mode) { + FILE *f = fopen(path, mode); + if (!f) { + throw runtime_error(string("Unable to open file: ") + path); + } + return f; +} diff --git a/src/util/dump_util.h b/src/util/dump_util.h new file mode 100644 index 00000000..487d2e7c --- /dev/null +++ b/src/util/dump_util.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DUMP_UTIL +#define DUMP_UTIL + +#include + +/** + * Same as fopen(), but on error throws an exception rather than returning NULL. + */ +FILE *fopen_or_throw(const char *path, const char *mode); + +#endif diff --git a/src/util/fatbit.h b/src/util/fatbit.h index ad607638..3c65db1a 100644 --- a/src/util/fatbit.h +++ b/src/util/fatbit.h @@ -40,6 +40,10 @@ #include "multibit.h" #include "ue2common.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MIN_FAT_SIZE 32 struct fatbit { @@ -82,11 +86,8 @@ u32 fatbit_iterate(const struct fatbit *bits, u32 total_bits, u32 it_in) { return mmbit_iterate(bits->fb_int.raw, total_bits, it_in); } -/** \brief Return the size in bytes of a fatbit that can store the given - * number of bits. - * - * Not for use in performance-critical code, implementation is in fatbit.c. - */ -u32 fatbit_size(u32 total_bits); +#ifdef __cplusplus +} // extern "C" +#endif #endif diff --git a/src/util/fatbit_build.cpp b/src/util/fatbit_build.cpp new file mode 100644 index 00000000..77f4b550 --- /dev/null +++ b/src/util/fatbit_build.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fatbit_build.h" + +#include "fatbit.h" +#include "multibit_build.h" + +#include + +using namespace std; + +namespace ue2 { + +u32 fatbit_size(u32 total_bits) { + return max(u32{sizeof(struct fatbit)}, mmbit_size(total_bits)); +} + +} // namespace ue2 diff --git a/src/util/fatbit_build.h b/src/util/fatbit_build.h new file mode 100644 index 00000000..d7611657 --- /dev/null +++ b/src/util/fatbit_build.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Fatbit: build code + */ + +#ifndef FATBIT_BUILD_H +#define FATBIT_BUILD_H + +#include "ue2common.h" + +namespace ue2 { + +/** + * \brief Return the size in bytes of a fatbit that can store the given + * number of bits. + */ +u32 fatbit_size(u32 total_bits); + +} // namespace ue2 + +#endif // FATBIT_BUILD_H diff --git a/src/util/graph.h b/src/util/graph.h index 90589f14..4c2876f1 100644 --- a/src/util/graph.h +++ b/src/util/graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,71 +38,22 @@ #include "util/graph_range.h" #include "util/ue2_containers.h" -#include -#include #include -#include +#include +#include + +#include +#include +#include +#include +#include namespace ue2 { /** \brief True if the given vertex has no out-edges. 
*/ template bool isLeafNode(const typename Graph::vertex_descriptor& v, const Graph& g) { - typename Graph::adjacency_iterator ai, ae; - std::tie(ai, ae) = adjacent_vertices(v, g); - return ai == ae; // no out edges -} - -/** \brief True if the out-degree of vertex \a v is greater than the given - * limit. */ -template -bool hasGreaterOutDegree(size_t limit, - const typename Graph::vertex_descriptor& v, - const Graph& g) { - typename Graph::out_edge_iterator ei, ee; - for (std::tie(ei, ee) = out_edges(v, g); ei != ee; ++ei) { - if (limit-- == 0) { - return true; - } - } - return false; -} - -/** \brief Returns true if the in-degree of vertex \a v is greater than the - * given limit. */ -template -bool hasGreaterInDegree(size_t limit, - const typename Graph::vertex_descriptor& v, - const Graph& g) { - typename Graph::in_edge_iterator ei, ee; - for (std::tie(ei, ee) = in_edges(v, g); ei != ee; ++ei) { - if (limit-- == 0) { - return true; - } - } - return false; -} - -/** - * \brief True if the degree of vertex \a v is greater than the given limit. - */ -template -bool has_greater_degree(size_t limit, - const typename Graph::vertex_descriptor &v, - const Graph &g) { - typename Graph::in_edge_iterator ei, ee; - for (std::tie(ei, ee) = in_edges(v, g); ei != ee; ++ei) { - if (limit-- == 0) { - return true; - } - } - typename Graph::out_edge_iterator oi, oe; - for (std::tie(oi, oe) = out_edges(v, g); oi != oe; ++oi) { - if (limit-- == 0) { - return true; - } - } - return false; + return out_degree(v, g) == 0; } /** \brief True if vertex \a v has an edge to itself. */ @@ -137,48 +88,10 @@ size_t proper_in_degree(const typename Graph::vertex_descriptor &v, return in_degree(v, g) - (edge(v, v, g).second ? 1 : 0); } -/** \brief Returns true iff the in-degree of vertex \a v is \a expected */ -template -bool in_degree_equal_to(const typename Graph::vertex_descriptor &v, - const Graph &g, size_t expected) { - size_t seen = 0; - typename Graph::in_edge_iterator ei, ee; - for (std::tie(ei, ee) = in_edges(v, g);; ++ei, seen++) { - if (seen == expected) { - return ei == ee; - } - if (ei == ee) { - return false; - } - } -} - -/** \brief same as edge(s, t, g) by finds edge by inspecting in-edges of target. - * Should be used when it is known that t has a small in-degree and when s - * may have a large out-degree. - */ -template -std::pair -edge_by_target(const typename Graph::vertex_descriptor &s, - const typename Graph::vertex_descriptor &t, const Graph &g) { - typename Graph::in_edge_iterator ei, ee; - for (std::tie(ei, ee) = in_edges(t, g); ei != ee; ++ei) { - if (source(*ei, g) == s) { - return std::make_pair(*ei, true); - } - } - - return std::make_pair(typename Graph::edge_descriptor(), false); -} - - /** \brief True if vertex \a v has at least one successor. */ template bool has_successor(const typename Graph::vertex_descriptor &v, const Graph &g) { - typename Graph::adjacency_iterator ai, ae; - std::tie(ai, ae) = adjacent_vertices(v, g); - - return ai != ae; + return out_degree(v, g) > 0; } /** \brief True if vertex \a v has at least one successor other than itself. */ @@ -197,26 +110,6 @@ bool has_proper_successor(const typename Graph::vertex_descriptor &v, return ai != ae; } -/** \brief A version of clear_vertex that explicitly removes in- and out-edges - * for vertex \a v. For many graphs, this is faster than the BGL clear_vertex - * function, which walks the graph's full edge list. 
*/ -template -void clear_vertex_faster(typename Graph::vertex_descriptor v, Graph &g) { - typename Graph::in_edge_iterator ei, ee; - tie(ei, ee) = in_edges(v, g); - while (ei != ee) { - remove_edge(*ei++, g); - } - - typename Graph::out_edge_iterator oi, oe; - tie(oi, oe) = out_edges(v, g); - while (oi != oe) { - // NOTE: version that takes out_edge_iterator is faster according to - // the BGL docs. - remove_edge(oi++, g); - } -} - /** \brief Find the set of vertices that are reachable from the vertices in \a * sources. */ template @@ -251,6 +144,41 @@ void find_unreachable(const Graph &g, const SourceCont &sources, OutCont *out) { } } +template +ue2::flat_set +find_vertices_in_cycles(const Graph &g) { + using vertex_descriptor = typename Graph::vertex_descriptor; + + std::map comp_map; + + boost::strong_components(g, boost::make_assoc_property_map(comp_map)); + + std::map> comps; + + for (const auto &e : comp_map) { + comps[e.second].push_back(e.first); + } + + ue2::flat_set rv; + + for (const auto &comp : comps | boost::adaptors::map_values) { + /* every vertex in a strongly connected component is reachable from + * every other vertex in the component. A vertex is involved in a cycle + * therefore if it is in a strongly connected component with more than + * one vertex or if it is the only vertex and it has a self loop. */ + assert(!comp.empty()); + if (comp.size() > 1) { + insert(&rv, comp); + } + vertex_descriptor v = *comp.begin(); + if (hasSelfLoop(v, g)) { + rv.insert(v); + } + } + + return rv; +} + template bool has_parallel_edge(const Graph &g) { using vertex_descriptor = typename Graph::vertex_descriptor; @@ -291,6 +219,22 @@ bool is_dag(const Graph &g, bool ignore_self_loops = false) { return true; } +template +class vertex_recorder : public boost::default_dfs_visitor { +public: + explicit vertex_recorder(Cont &o) : out(o) {} + template + void discover_vertex(typename Cont::value_type v, const G &) { + out.insert(v); + } + Cont &out; +}; + +template +vertex_recorder make_vertex_recorder(Cont &o) { + return vertex_recorder(o); +} + template std::pair add_edge_if_not_present(typename Graph::vertex_descriptor u, @@ -313,6 +257,40 @@ std::pair add_edge_if_not_present( return e; } +#ifndef NDEBUG + +template +bool hasCorrectlyNumberedVertices(const Graph &g) { + auto count = num_vertices(g); + std::vector ids(count, false); + for (auto v : vertices_range(g)) { + auto id = g[v].index; + if (id >= count || ids[id]) { + return false; // duplicate + } + ids[id] = true; + } + return std::find(ids.begin(), ids.end(), false) == ids.end() + && count == vertex_index_upper_bound(g); +} + +template +bool hasCorrectlyNumberedEdges(const Graph &g) { + auto count = num_edges(g); + std::vector ids(count, false); + for (const auto &e : edges_range(g)) { + auto id = g[e].index; + if (id >= count || ids[id]) { + return false; // duplicate + } + ids[id] = true; + } + return std::find(ids.begin(), ids.end(), false) == ids.end() + && count == edge_index_upper_bound(g); +} + +#endif + } // namespace ue2 #endif // UTIL_GRAPH_H diff --git a/src/util/graph_range.h b/src/util/graph_range.h index 82814695..3df06911 100644 --- a/src/util/graph_range.h +++ b/src/util/graph_range.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,7 +51,6 @@ #ifndef UTIL_GRAPH_RANGE_H #define UTIL_GRAPH_RANGE_H 
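The new hasCorrectlyNumberedVertices()/hasCorrectlyNumberedEdges() debug checks in graph.h enforce a simple invariant: the index properties must form exactly the set {0, ..., n-1}, with no gaps and no duplicates. The same check, sketched against a plain vector of indices rather than a BGL graph:

    #include <cstddef>
    #include <vector>

    static bool correctly_numbered(const std::vector<size_t> &indices) {
        const size_t count = indices.size();
        std::vector<bool> seen(count, false);
        for (size_t id : indices) {
            if (id >= count || seen[id]) {
                return false; // out of range, or a duplicate
            }
            seen[id] = true;
        }
        return true; // every value in [0, count) present exactly once
    }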
-#include #include namespace ue2 { diff --git a/src/util/hash.h b/src/util/hash.h new file mode 100644 index 00000000..0b571772 --- /dev/null +++ b/src/util/hash.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Hashing utility functions. + */ + +#ifndef UTIL_HASH_H +#define UTIL_HASH_H + +#include + +namespace ue2 { + +namespace hash_detail { + +template +void hash_build(size_t &v, const T &obj) { + boost::hash_combine(v, obj); +} + +template +void hash_build(size_t &v, const T &obj, Args&&... args) { + hash_build(v, obj); + hash_build(v, args...); // recursive +} + +} // namespace hash_detail + +/** + * \brief Computes the combined hash of all its arguments. + * + * Simply use: + * + * size_t hash = hash_all(a, b, c, d); + * + * Where a, b, c and d are hashable. + */ +template +size_t hash_all(Args&&... 
args) { + size_t v = 0; + hash_detail::hash_build(v, args...); + return v; +} + +} // namespace ue2 + +#endif // UTIL_HASH_H diff --git a/src/util/masked_move.c b/src/util/masked_move.c index 71406308..ec788db7 100644 --- a/src/util/masked_move.c +++ b/src/util/masked_move.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,7 @@ /* masks for masked moves */ /* magic mask for maskload (vmmaskmovq) - described in UE-2424 */ -const u32 mm_mask_mask[16] ALIGN_CL_DIRECTIVE = { +const ALIGN_CL_DIRECTIVE u32 mm_mask_mask[16] = { 0x00000000U, 0x00000000U, 0x00000000U, diff --git a/src/util/multibit.c b/src/util/multibit.c index c22b73ff..de192d7d 100644 --- a/src/util/multibit.c +++ b/src/util/multibit.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -138,62 +138,3 @@ const u32 mmbit_root_offset_from_level[7] = { 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4), 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4) + (1 << MMB_KEY_SHIFT * 5), }; - -u32 mmbit_size(u32 total_bits) { - MDEBUG_PRINTF("%u\n", total_bits); - - // Flat model multibit structures are just stored as a bit vector. - if (total_bits <= MMB_FLAT_MAX_BITS) { - return ROUNDUP_N(total_bits, 8) / 8; - } - - u64a current_level = 1; // Number of blocks on current level. - u64a total = 0; // Total number of blocks. - while (current_level * MMB_KEY_BITS < total_bits) { - total += current_level; - current_level <<= MMB_KEY_SHIFT; - } - - // Last level is a one-for-one bit vector. It needs room for total_bits - // elements, rounded up to the nearest block. - u64a last_level = ((u64a)total_bits + MMB_KEY_BITS - 1) / MMB_KEY_BITS; - total += last_level; - - assert(total * sizeof(MMB_TYPE) <= UINT32_MAX); - return (u32)(total * sizeof(MMB_TYPE)); -} - -#ifdef DUMP_SUPPORT - -#include -#include - -/** \brief Dump a sparse iterator's keys to stdout. */ -void mmbit_sparse_iter_dump(const struct mmbit_sparse_iter *it, - u32 total_bits) { - // Expediency and future-proofing: create a temporary multibit of the right - // size with all the bits on, then walk it with this sparse iterator. - size_t bytes = mmbit_size(total_bits); - u8 *bits = malloc(bytes); - if (!bits) { - printf("Failed to alloc %zu bytes for temp multibit", bytes); - return; - } - for (u32 i = 0; i < total_bits; i++) { - mmbit_set_i(bits, total_bits, i); - } - - struct mmbit_sparse_state s[MAX_SPARSE_ITER_STATES]; - u32 idx = 0; - for (u32 i = mmbit_sparse_iter_begin(bits, total_bits, &idx, it, s); - i != MMB_INVALID; - i = mmbit_sparse_iter_next(bits, total_bits, i, &idx, it, s)) { - printf("%u ", i); - } - - printf("(%u keys)", idx + 1); - - free(bits); -} - -#endif // DUMP_SUPPORT diff --git a/src/util/multibit.h b/src/util/multibit.h index ddc8bbdd..4df8733a 100644 --- a/src/util/multibit.h +++ b/src/util/multibit.h @@ -162,7 +162,7 @@ u32 mmb_popcount(MMB_TYPE val) { } #ifndef MMMB_DEBUG -#define MDEBUG_PRINTF(x, ...) do { } while(0); +#define MDEBUG_PRINTF(x, ...) 
do { } while(0) #else #define MDEBUG_PRINTF DEBUG_PRINTF #endif diff --git a/src/util/multibit_build.cpp b/src/util/multibit_build.cpp index 2a402d8c..5fe2d617 100644 --- a/src/util/multibit_build.cpp +++ b/src/util/multibit_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "scatter.h" #include "ue2common.h" #include "rose/rose_build_scatter.h" +#include "util/compile_error.h" #include #include // for memset @@ -45,6 +46,32 @@ using namespace std; namespace ue2 { +u32 mmbit_size(u32 total_bits) { + if (total_bits > MMB_MAX_BITS) { + throw ResourceLimitError(); + } + + // Flat model multibit structures are just stored as a bit vector. + if (total_bits <= MMB_FLAT_MAX_BITS) { + return ROUNDUP_N(total_bits, 8) / 8; + } + + u64a current_level = 1; // Number of blocks on current level. + u64a total = 0; // Total number of blocks. + while (current_level * MMB_KEY_BITS < total_bits) { + total += current_level; + current_level <<= MMB_KEY_SHIFT; + } + + // Last level is a one-for-one bit vector. It needs room for total_bits + // elements, rounded up to the nearest block. + u64a last_level = ((u64a)total_bits + MMB_KEY_BITS - 1) / MMB_KEY_BITS; + total += last_level; + + assert(total * sizeof(MMB_TYPE) <= UINT32_MAX); + return (u32)(total * sizeof(MMB_TYPE)); +} + namespace { struct TreeNode { MMB_TYPE mask = 0; @@ -133,6 +160,7 @@ void mmbBuildSparseIterator(vector &out, assert(out.empty()); assert(!bits.empty()); assert(total_bits > 0); + assert(total_bits <= MMB_MAX_BITS); DEBUG_PRINTF("building sparse iter for %zu of %u bits\n", bits.size(), total_bits); diff --git a/src/util/multibit_build.h b/src/util/multibit_build.h index ac263552..951f1fb4 100644 --- a/src/util/multibit_build.h +++ b/src/util/multibit_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,20 +34,31 @@ #define MULTIBIT_BUILD_H #include "multibit_internal.h" +#include "hash.h" #include -/** \brief Comparator for \ref mmbit_sparse_iter structures. */ -static inline -bool operator<(const mmbit_sparse_iter &a, const mmbit_sparse_iter &b) { - if (a.mask != b.mask) { - return a.mask < b.mask; - } - return a.val < b.val; +inline +bool operator==(const mmbit_sparse_iter &a, const mmbit_sparse_iter &b) { + return a.mask == b.mask && a.val == b.val; +} + +inline +size_t hash_value(const mmbit_sparse_iter &iter) { + return ue2::hash_all(iter.mask, iter.val); } namespace ue2 { +/** + * \brief Return the size in bytes of a multibit that can store the given + * number of bits. + * + * This will throw a resource limit assertion if the requested mmbit is too + * large. + */ +u32 mmbit_size(u32 total_bits); + /** \brief Construct a sparse iterator over the values in \a bits for a * multibit of size \a total_bits. 
*/ void mmbBuildSparseIterator(std::vector &out, diff --git a/src/util/multibit_internal.h b/src/util/multibit_internal.h index de87fe2a..350f3bfd 100644 --- a/src/util/multibit_internal.h +++ b/src/util/multibit_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,9 @@ extern "C" { typedef u64a MMB_TYPE; /**< Basic block type for mmbit operations. */ #define MMB_MAX_LEVEL 6 /**< Maximum level in the mmbit pyramid. */ +/** \brief Maximum number of keys (bits) in a multibit. */ +#define MMB_MAX_BITS (1U << 31) + /** \brief Sparse iterator record type. * * A sparse iterator is a tree of these records, where val identifies the @@ -71,13 +74,6 @@ struct mmbit_sparse_state { /** \brief Maximum number of \ref mmbit_sparse_state that could be needed. */ #define MAX_SPARSE_ITER_STATES (6 + 1) -/** \brief Return the size in bytes of a multibit that can store the given - * number of bits. - * - * Not for use in performance-critical code, implementation is in multibit.c. - */ -u32 mmbit_size(u32 total_bits); - #ifdef __cplusplus } // extern "C" #endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index e4541411..d6e5d6a3 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -61,7 +61,12 @@ #error no intrinsics! #endif +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) typedef __m128i m128; +#else +typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +#endif + #if defined(__AVX2__) typedef __m256i m256; #else diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c index a86c568d..54b5b4ba 100644 --- a/src/util/simd_utils.c +++ b/src/util/simd_utils.c @@ -32,7 +32,7 @@ #include "simd_utils.h" -const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { +ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, @@ -48,7 +48,7 @@ const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 /** \brief LUT for the mask1bit functions. */ -const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = { +ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { ZEROES_31, 0x01, ZEROES_32, ZEROES_31, 0x02, ZEROES_32, ZEROES_31, 0x04, ZEROES_32, diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 3544629f..e8676249 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -71,6 +71,7 @@ #include "ue2common.h" #include "simd_types.h" +#include "unaligned.h" // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. 
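The simd_utils.h hunks that follow extend the pre-AVX2 path, where an m256 is emulated as a struct of two m128 halves; movemask256, for example, stitches the two 16-bit byte masks together. A standalone sketch of that stitching using standard SSE2 intrinsics (m256_stub is a stand-in for the emulated m256 type):

    #include <cstdint>
    #include <emmintrin.h> // SSE2

    struct m256_stub {
        __m128i lo;
        __m128i hi;
    };

    static inline uint32_t movemask256_sketch(m256_stub a) {
        // _mm_movemask_epi8 yields one bit per byte of a 128-bit lane.
        uint32_t lo_mask = (uint32_t)_mm_movemask_epi8(a.lo); // bits 0..15
        uint32_t hi_mask = (uint32_t)_mm_movemask_epi8(a.hi); // bits 16..31
        return lo_mask | (hi_mask << 16);
    }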
@@ -158,6 +159,10 @@ static really_inline m128 set16x8(u8 c) { return _mm_set1_epi8(c); } +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -172,6 +177,20 @@ static really_inline u64a movq(const m128 in) { #endif } +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) + /* unfortunately _mm_loadl_epi64() is best avoided as it seems to cause + * trouble on some older compilers, possibly because it is misdefined to + * take an m128 as its parameter */ + return _mm_set_epi64((__m64)0ULL, (__m64)*p); +#else + /* ICC doesn't like casting to __m64 */ + return _mm_loadl_epi64((const m128 *)p); +#endif +} + #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) @@ -245,7 +264,13 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +#ifdef __cplusplus +extern "C" { +#endif extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif static really_inline m128 mask1bit128(unsigned int n) { @@ -269,12 +294,12 @@ void clearbit128(m128 *ptr, unsigned int n) { // tests bit N in the given vector. static really_inline -char testbit128(const m128 *ptr, unsigned int n) { +char testbit128(m128 val, unsigned int n) { const m128 mask = mask1bit128(n); #if defined(__SSE4_1__) - return !_mm_testz_si128(mask, *ptr); + return !_mm_testz_si128(mask, val); #else - return isnonzero128(and128(mask, *ptr)); + return isnonzero128(and128(mask, val)); #endif } @@ -307,6 +332,25 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { return pshufb(in, shift_mask); } +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} /**** **** 256-bit Primitives @@ -354,6 +398,26 @@ m256 set32x8(u32 in) { return rv; } +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} #endif static really_inline m256 zeroes256(void) { @@ -504,6 +568,10 @@ static really_inline m256 load2x128(const void *ptr) { #endif } +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + // aligned store static really_inline void store256(void *ptr, m256 a) { assert(ISALIGNED_N(ptr, alignof(m256))); @@ -525,6 +593,16 @@ static really_inline m256 loadu256(const void *ptr) { #endif } +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(__AVX2__) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + // packed unaligned store of first N bytes static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { @@ -580,18 +658,34 @@ void clearbit256(m256 *ptr, unsigned int n) { // tests bit N in the given vector. 
static really_inline -char testbit256(const m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - const m128 *sub; +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; if (n < 128) { - sub = &ptr->lo; + sub = val.lo; } else { - sub = &ptr->hi; + sub = val.hi; n -= 128; } return testbit128(sub, n); } +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + #else // AVX2 // switches on bit N in the given vector. @@ -607,9 +701,9 @@ void clearbit256(m256 *ptr, unsigned int n) { // tests bit N in the given vector. static really_inline -char testbit256(const m256 *ptr, unsigned int n) { +char testbit256(m256 val, unsigned int n) { const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, *ptr); + return !_mm256_testz_si256(mask, val); } static really_really_inline @@ -636,6 +730,14 @@ m128 movdq_lo(m256 x) { #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b); #define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} #endif //AVX2 /**** @@ -801,15 +903,15 @@ void clearbit384(m384 *ptr, unsigned int n) { // tests bit N in the given vector. static really_inline -char testbit384(const m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - const m128 *sub; +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; if (n < 128) { - sub = &ptr->lo; + sub = val.lo; } else if (n < 256) { - sub = &ptr->mid; + sub = val.mid; } else { - sub = &ptr->hi; + sub = val.hi; } return testbit128(sub, n % 128); } @@ -1014,26 +1116,26 @@ void clearbit512(m512 *ptr, unsigned int n) { // tests bit N in the given vector. 
static really_inline -char testbit512(const m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); #if !defined(__AVX2__) - const m128 *sub; + m128 sub; if (n < 128) { - sub = &ptr->lo.lo; + sub = val.lo.lo; } else if (n < 256) { - sub = &ptr->lo.hi; + sub = val.lo.hi; } else if (n < 384) { - sub = &ptr->hi.lo; + sub = val.hi.lo; } else { - sub = &ptr->hi.hi; + sub = val.hi.hi; } return testbit128(sub, n % 128); #else - const m256 *sub; + m256 sub; if (n < 256) { - sub = &ptr->lo; + sub = val.lo; } else { - sub = &ptr->hi; + sub = val.hi; n -= 256; } return testbit256(sub, n); diff --git a/src/util/ue2_containers.h b/src/util/ue2_containers.h index 217d08ea..5bbf4cfe 100644 --- a/src/util/ue2_containers.h +++ b/src/util/ue2_containers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -207,6 +207,10 @@ public: return std::make_pair(iterator(it), false); } + iterator insert(UNUSED const_iterator hint, const value_type &value) { + return insert(value).first; + } + std::pair insert(value_type &&value) { auto it = std::lower_bound(data.begin(), data.end(), value, comp); if (it == data.end() || comp(value, *it)) { @@ -216,6 +220,10 @@ public: return std::make_pair(iterator(it), false); } + iterator insert(UNUSED const_iterator hint, value_type &&value) { + return insert(value).first; + } + template void insert(InputIt first, InputIt second) { for (; first != second; ++first) { diff --git a/src/util/ue2_graph.h b/src/util/ue2_graph.h new file mode 100644 index 00000000..9634b032 --- /dev/null +++ b/src/util/ue2_graph.h @@ -0,0 +1,1304 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef UE2_GRAPH_H +#define UE2_GRAPH_H + +#include "ue2common.h" +#include "util/graph_range.h" + +#include +#include +#include /* vertex_index_t, ... */ +#include /* no_property */ +#include +#include +#include +#include + +#include /* tie */ +#include /* pair, declval */ + +/* + * Basic design of ue2_graph: + * + * Fairly standard adjacency list type graph structure. The main internal + * structures are vertex_node and edge_node. + * + * Each vertex_node maintains lists of incoming and outgoing edge_nodes, a + * serial number and the vertex properties. + * + * Each edge_node contains pointers to the source and target vertex as well as + * the serial number and edge properties. + * + * Every time an edge_node or vertex_node is created in the graph, it is given a + * unique serial number by increasing a private counter in the graph. + * + * The main thing to note is that the in and out edge lists are intrusive lists + * with the edge_node containing the necessary hooks. This means that we can + * easily convert the edge_node to iterators of the in_edge_list and + * out_edge_list and remove them from the lists. + * + * vertex_descriptor and edge_descriptor structures both just wrap pointers to + * the relevant node structure along with the serial number. operator<() for the + * descriptors is overridden to look at the serial member of the node. + * We do not use: + * - the address of the node structure as this would lead to an unstable + * ordering of vertices between runs. + * - the index field as this would mean that the generation of new index + * values (during say renumbering of vertex nodes after removing some + * vertices) would potentially reorder vertices and corrupt containers + * such as std::set<>. + * The serial number is copied into the descriptors so that we can still have + * descriptors in a container (such as set or unordered_set) after removing the + * underlying node. + * + * Hashing of descriptors is based on the serial field for similar reasons. + * + * + * + * Main differences from boost::adjacency_list<> with listS: + * + * (1) Deterministic ordering for vertices and edges + * boost::adjacency_list<> uses pointer ordering for vertex_descriptors. As + * a result, ordering of vertices and edges between runs is + * non-deterministic unless containers, etc use custom comparators. + * + * (2) Proper types for descriptors, etc. + * No more void * for vertex_descriptors and trying to use it for the wrong + * graph type. + * + * (3) Constant time num_edges(), num_vertices(), degree(), in_degree() and + * out_degree() + * std::list is meant to have constant time in C++11 ::size(), but this is + * not always implemented as people want to keep ABI compatibility with + * existing C++98 standard libraries (gcc 4.8). As ue2_graph_h uses + * intrusive lists rather than std::list this is not an issue for us. + * + * (4) Constant time remove_edge(e, g) + * ue2_graph uses boost::intrusive_lists internally so we can easily unlink + * an edge from the in and out edgelist of its source and target. + * + * (5) More efficient edge(u, v, g) and remove_edge(u, v, g) + * ue2_graph will check which of u and v has the smallest relevant degree + * and use that to search for the edge(s). + * + * (6) Automatically populate the index field of vertex and edge bundles. + * Saves us from doing it manually. Naturally there is nothing to prevent + * the user from stuffing up the index properties later. 
+ * + * (7) Different edge iteration order + * ue2_graph does not maintain an explicit global edge list, so the + * edge_iterator is constructed out of vertex_iterator and + * out_edge_iterators by iterating the out_edges of each vertices. This + * means that edge iteration order is not insertion order like for + * adjacency_list. + * + * (8) null_edge() + * Because why not? + * + * (9) vertex and edge properties must have an index field. + * We generally need them so the effort has not been put into specialising + * for when they are not present. + * + * + * + * Possible Future Work: + * + * (1) Improve edge(u, v, g) performance + * This function sees a fair amount of use and is O(n) in the smallest of + * the source out_degree or target in_degree. This could be improved by + * changes on of the edge containers to be something similar to a multiset. + * + * (2) 'Lie' about the number of edges / vertices + * + * One of the main uses of num_edges() and num_vertices() is to allocate a + * vector, etc so that it can be indexed by edge or vertex index. If + * num_edges() and num_vertices() returned the appropriate size for such a + * vector (at least one more than the largest index), we would be able to + * avoid some renumbering operations. Functions would have to be provided to + * get the real number of vertices and edges. Having num_vertices() and + * num_edges() return an over-estimate is not without precedence in the BGL + * - the filtered_graph adaptor does the same thing and is compatible with + * various (all?) BGL algorithms. It is not clear that this was done + * deliberately for the same reason or because it is difficult for + * filtered_graph to get the true counts. + * + * (3) Investigate slab/pooled allocation schemes for nodes. + */ + +namespace ue2 { + +namespace graph_detail { + +class graph_base : boost::noncopyable { +}; + +struct default_edge_property { + size_t index; +}; + +struct default_vertex_property { + size_t index; +}; + +} + +template +class ue2_graph : graph_detail::graph_base { +private: + struct in_edge_tag { }; + struct out_edge_tag { }; + + struct vertex_node; + + using out_edge_hook + = boost::intrusive::list_base_hook >; + + /* in_edge_hook does not use safe mode as during graph destruction we do not + * maintain the in edge lists */ + using in_edge_hook + = boost::intrusive::list_base_hook, + boost::intrusive::link_mode >; + + struct edge_node : public out_edge_hook, public in_edge_hook { + explicit edge_node(u64a serial_in) : serial(serial_in) { } + + vertex_node *source = nullptr; + vertex_node *target = nullptr; + const u64a serial; /*< used to order edges. We do not use props.index so + * that there is no danger of invalidating sets or + * other containers by changing the index due to + * renumbering */ + EdgePropertyType props; + }; + + template using vertex_edge_list + = boost::intrusive::list >; + + struct vertex_node : public boost::intrusive::list_base_hook<> { + explicit vertex_node(u64a serial_in) : serial(serial_in) { } + + VertexPropertyType props; + const u64a serial; /*< used to order vertices. 
We do not use props.index + * so that there is no danger of invalidating sets or + * other containers by changing the index due to + * renumbering */ + + /* The incoming edges are not considered owned by the vertex */ + vertex_edge_list in_edge_list; + + /* The out going edges are considered owned by the vertex and + * need to be freed when the graph is begin destroyed */ + vertex_edge_list out_edge_list; + + /* The destructor only frees memory owned by the vertex and will leave + * the neighbour's edges in a bad state. If a vertex is being removed + * (rather than the graph being destroyed), then the more gentle clean + * up of clear_vertex() is required to be called first */ + ~vertex_node() { + out_edge_list.clear_and_dispose(delete_disposer()); + } + }; + + struct delete_disposer { + template void operator()(const T *d) const { delete d; } + }; + + struct in_edge_disposer { + void operator()(edge_node *e) const { + /* remove from source's out edge list before deleting */ + vertex_node *u = e->source; + u->out_edge_list.erase(u->out_edge_list.iterator_to(*e)); + delete e; + } + }; + + struct out_edge_disposer { + void operator()(edge_node *e) const { + /* remove from target's in edge list before deleting */ + vertex_node *v = e->target; + v->in_edge_list.erase(v->in_edge_list.iterator_to(*e)); + delete e; + } + }; + + using vertices_list_type + = boost::intrusive::list > >; + + vertices_list_type vertices_list; + +protected: /* to allow renumbering */ + static const size_t N_SPECIAL_VERTICES = 0; /* override in derived class */ + size_t next_vertex_index = 0; + size_t next_edge_index = 0; + +private: + size_t graph_edge_count = 0; /* maintained explicitly as we have no global + edge list */ + + u64a next_serial = 0; + u64a new_serial() { + u64a serial = next_serial++; + if (!next_serial) { + /* if we have created enough graph edges/vertices to overflow a u64a + * we must have spent close to an eternity adding to this graph so + * something must have gone very wrong and we will not be producing + * a final bytecode in a reasonable amount of time. Or, more likely, + * the next_serial value has become corrupt. 
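The in- and out-edge lists above are Boost.Intrusive lists, which is what makes O(1) unlink and ownership-aware disposal possible. A self-contained sketch of that technique, assuming a simplified `edge` element (the real hooks are tagged and use different link modes for the in and out lists):

```cpp
#include <boost/intrusive/list.hpp>

struct edge : boost::intrusive::list_base_hook<> {
    explicit edge(int i) : index(i) {}
    int index;
};

struct deleter {
    void operator()(edge *e) const { delete e; }
};

int main() {
    boost::intrusive::list<edge> out_edges; // holds hooks, not copies
    out_edges.push_back(*new edge(0));
    out_edges.push_back(*new edge(1));

    // O(1) unlink given only a reference to the element itself:
    edge &e = out_edges.front();
    out_edges.erase(out_edges.iterator_to(e));
    delete &e;

    // dispose of the remaining "owned" elements, as the out-edge list
    // does when the graph is destroyed
    out_edges.clear_and_dispose(deleter());
    return 0;
}
```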
*/ + throw std::overflow_error("too many graph edges/vertices created"); + } + return serial; + } +public: + using vertices_size_type = typename vertices_list_type::size_type; + using degree_size_type + = typename vertex_edge_list::size_type; + using edges_size_type = size_t; + + using vertex_property_type = VertexPropertyType; + using edge_property_type = EdgePropertyType; + + using graph_bundled = boost::no_property; + using vertex_bundled = VertexPropertyType; + using edge_bundled = EdgePropertyType; + + class vertex_descriptor : boost::totally_ordered { + public: + vertex_descriptor() : p(nullptr), serial(0) { } + explicit vertex_descriptor(vertex_node *pp) + : p(pp), serial(pp->serial) { } + + operator bool() const { return p; } + bool operator<(const vertex_descriptor b) const { + if (p && b.p) { + /* no vertices in the same graph can have the same serial */ + assert(p == b.p || serial != b.serial); + return serial < b.serial; + } else { + return p < b.p; + } + } + bool operator==(const vertex_descriptor b) const { + return p == b.p; + } + + friend size_t hash_value(vertex_descriptor v) { + using boost::hash_value; + return hash_value(v.serial); + } + + private: + vertex_node *raw(void) { return p; } + vertex_node *p; + u64a serial; + friend ue2_graph; + }; + + class edge_descriptor : boost::totally_ordered { + public: + edge_descriptor() : p(nullptr), serial(0) { } + explicit edge_descriptor(edge_node *pp) : p(pp), serial(pp->serial) { } + + /* Convenice ctor to allow us to directly get an edge_descriptor from + * edge() and add_edge(). As we have null_edges and we always allow + * parallel edges, the bool component of the return from these functions + * is not required. */ + edge_descriptor(const std::pair &tup) + : p(tup.first.p), serial(tup.first.serial) { + assert(tup.second == (bool)tup.first); + } + + operator bool() const { return p; } + bool operator<(const edge_descriptor b) const { + if (p && b.p) { + /* no edges in the same graph can have the same serial */ + assert(p == b.p || serial != b.serial); + return serial < b.serial; + } else { + return p < b.p; + } + } + bool operator==(const edge_descriptor b) const { + return p == b.p; + } + + friend size_t hash_value(edge_descriptor e) { + using boost::hash_value; + return hash_value(e.serial); + } + + private: + edge_node *raw(void) { return p; } + edge_node *p; + u64a serial; + friend ue2_graph; + }; + +private: + /* Note: apparently, nested class templates cannot be fully specialised but + * they can be partially specialised. Sigh, ... 
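The workaround alluded to in that note deserves a standalone example: an explicit (full) specialisation of a member class template is not permitted inside the enclosing class, but adding a dummy trailing parameter turns each case into a legal partial specialisation. A reduced sketch with hypothetical names:

```cpp
#include <type_traits>

template<typename V, typename E>
struct outer {
    template<typename T, typename = void>
    struct key { };

    // OK: the dummy second parameter keeps these *partial* specialisations
    template<typename D> struct key<V, D> { using type = int; };
    template<typename D> struct key<E, D> { using type = long; };

    // A full specialisation here would be ill-formed:
    // template<> struct key<V, void> { /* ... */ };
};

static_assert(
    std::is_same<outer<char, float>::key<char>::type, int>::value,
    "char selects the V branch");
```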
*/ + template + struct bundle_key_type { + }; + + template + struct bundle_key_type { + using type = vertex_descriptor; + }; + + template + struct bundle_key_type { + using type = edge_descriptor; + }; + +public: + class out_edge_iterator : public boost::iterator_adaptor< + out_edge_iterator, + typename vertex_edge_list::const_iterator, + edge_descriptor, + boost::bidirectional_traversal_tag, + edge_descriptor> { + using super = typename out_edge_iterator::iterator_adaptor_; + public: + out_edge_iterator() : super() { } + explicit out_edge_iterator( + typename vertex_edge_list::const_iterator it) + : super(it) { } + edge_descriptor dereference() const { + /* :( const_cast makes me sad but constness is defined by the graph + * parameter of bgl api calls */ + return edge_descriptor(const_cast(&*super::base())); + } + }; + + class in_edge_iterator : public boost::iterator_adaptor< + in_edge_iterator, + typename vertex_edge_list::const_iterator, + edge_descriptor, + boost::bidirectional_traversal_tag, + edge_descriptor> { + using super = typename in_edge_iterator::iterator_adaptor_; + public: + in_edge_iterator() : super() { } + explicit in_edge_iterator( + typename vertex_edge_list::const_iterator it) + : super(it) { } + edge_descriptor dereference() const { + /* :( const_cast makes me sad but constness is defined by the graph + * parameter of bgl api calls */ + return edge_descriptor(const_cast(&*super::base())); + } + }; + + class adjacency_iterator : public boost::iterator_adaptor< + adjacency_iterator, + out_edge_iterator, + vertex_descriptor, + boost::bidirectional_traversal_tag, + vertex_descriptor> { + using super = typename adjacency_iterator::iterator_adaptor_; + public: + adjacency_iterator(out_edge_iterator a) : super(std::move(a)) { } + adjacency_iterator() { } + + vertex_descriptor dereference() const { + return vertex_descriptor(super::base()->p->target); + } + }; + + class inv_adjacency_iterator : public boost::iterator_adaptor< + inv_adjacency_iterator, + in_edge_iterator, + vertex_descriptor, + boost::bidirectional_traversal_tag, + vertex_descriptor> { + using super = typename inv_adjacency_iterator::iterator_adaptor_; + public: + inv_adjacency_iterator(in_edge_iterator a) : super(std::move(a)) { } + inv_adjacency_iterator() { } + + vertex_descriptor dereference() const { + return vertex_descriptor(super::base()->p->source); + } + }; + + class vertex_iterator : public boost::iterator_adaptor< + vertex_iterator, + typename vertices_list_type::const_iterator, + vertex_descriptor, + boost::bidirectional_traversal_tag, + vertex_descriptor> { + using super = typename vertex_iterator::iterator_adaptor_; + public: + vertex_iterator() : super() { } + explicit vertex_iterator(typename vertices_list_type::const_iterator it) + : super(it) { } + vertex_descriptor dereference() const { + /* :( const_cast makes me sad but constness is defined by the graph + * parameter of bgl api calls */ + return vertex_descriptor( + const_cast(&*super::base())); + } + }; + + class edge_iterator : public boost::iterator_facade< + edge_iterator, + edge_descriptor, + boost::forward_traversal_tag, /* TODO: make bidi */ + edge_descriptor> { + public: + using main_base_iter_type = vertex_iterator; + using aux_base_iter_type = out_edge_iterator; + + edge_iterator(main_base_iter_type b, main_base_iter_type e) + : main(std::move(b)), main_end(std::move(e)) { + if (main == main_end) { + return; + } + std::tie(aux, aux_end) = out_edges_impl(*main); + while (aux == aux_end) { + ++main; + if (main == main_end) { + 
break; + } + std::tie(aux, aux_end) = out_edges_impl(*main); + } + } + edge_iterator() { } + + friend class boost::iterator_core_access; + void increment() { + ++aux; + while (aux == aux_end) { + ++main; + if (main == main_end) { + break; + } + std::tie(aux, aux_end) = out_edges_impl(*main); + } + } + bool equal(const edge_iterator &other) const { + return main == other.main && (main == main_end || aux == other.aux); + } + edge_descriptor dereference() const { + return *aux; + } + + main_base_iter_type main; + main_base_iter_type main_end; + aux_base_iter_type aux; + aux_base_iter_type aux_end; + }; + +public: + static + vertex_descriptor null_vertex() { return vertex_descriptor(); } + + vertex_descriptor add_vertex_impl() { + vertex_node *v = new vertex_node(new_serial()); + v->props.index = next_vertex_index++; + vertices_list.push_back(*v); + return vertex_descriptor(v); + } + + void remove_vertex_impl(vertex_descriptor v) { + vertex_node *vv = v.raw(); + assert(vv->in_edge_list.empty()); + assert(vv->out_edge_list.empty()); + vertices_list.erase_and_dispose(vertices_list.iterator_to(*vv), + delete_disposer()); + } + + void clear_in_edges_impl(vertex_descriptor v) { + graph_edge_count -= v.raw()->in_edge_list.size(); + v.raw()->in_edge_list.clear_and_dispose(in_edge_disposer()); + } + + void clear_out_edges_impl(vertex_descriptor v) { + graph_edge_count -= v.raw()->out_edge_list.size(); + v.raw()->out_edge_list.clear_and_dispose(out_edge_disposer()); + } + + /* IncidenceGraph concept functions */ + + static + vertex_descriptor source_impl(edge_descriptor e) { + return vertex_descriptor(e.raw()->source); + } + + static + vertex_descriptor target_impl(edge_descriptor e) { + return vertex_descriptor(e.raw()->target); + } + + static + degree_size_type out_degree_impl(vertex_descriptor v) { + return v.raw()->out_edge_list.size(); + } + + static + std::pair + out_edges_impl(vertex_descriptor v) { + return {out_edge_iterator(v.raw()->out_edge_list.begin()), + out_edge_iterator(v.raw()->out_edge_list.end())}; + } + + /* BidirectionalGraph concept functions */ + + static + degree_size_type in_degree_impl(vertex_descriptor v) { + return v.raw()->in_edge_list.size(); + } + + static + std::pair + in_edges_impl(vertex_descriptor v) { + return {in_edge_iterator(v.raw()->in_edge_list.begin()), + in_edge_iterator(v.raw()->in_edge_list.end())}; + } + + /* Note: this is defined so that self loops are counted twice - which may or + * may not be what you want. Actually, you probably don't want this at + * all. */ + static + degree_size_type degree_impl(vertex_descriptor v) { + return in_degree_impl(v) + out_degree_impl(v); + } + + /* AdjacencyList concept functions */ + + static + std::pair + adjacent_vertices_impl(vertex_descriptor v) { + auto out_edge_its = out_edges_impl(v); + return {adjacency_iterator(out_edge_its.first), + adjacency_iterator(out_edge_its.second)}; + } + + /* AdjacencyMatrix concept functions + * (Note: complexity guarantee is not met) */ + + std::pair edge_impl(vertex_descriptor u, + vertex_descriptor v) const { + if (in_degree_impl(v) < out_degree_impl(u)) { + for (const edge_descriptor &e : in_edges_range(v, *this)) { + if (source_impl(e) == u) { + return {e, true}; + } + } + } else { + for (const edge_descriptor &e : out_edges_range(u, *this)) { + if (target_impl(e) == v) { + return {e, true}; + } + } + } + + return {edge_descriptor(), false}; + } + + /* Misc functions that don't actually seem to belong to a formal BGL + concept. 
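The `edge_impl()` just above is the point (5) optimisation from the header comment: probe whichever endpoint has the smaller relevant degree. In plain STL terms (with a hypothetical adjacency representation), the strategy is:

```cpp
#include <algorithm>
#include <vector>

struct mini_vertex {
    std::vector<int> out; // targets of out-edges
    std::vector<int> in;  // sources of in-edges
};

// cost is O(min(out_degree(u), in_degree(v))) rather than O(out_degree(u))
bool edge_exists(const std::vector<mini_vertex> &g, int u, int v) {
    if (g[v].in.size() < g[u].out.size()) {
        return std::find(g[v].in.begin(), g[v].in.end(), u) != g[v].in.end();
    }
    return std::find(g[u].out.begin(), g[u].out.end(), v) != g[u].out.end();
}
```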
*/ + static + edge_descriptor null_edge() { return edge_descriptor(); } + + static + std::pair + inv_adjacent_vertices_impl(vertex_descriptor v) { + auto in_edge_its = in_edges_impl(v); + return {inv_adjacency_iterator(in_edge_its.first), + inv_adjacency_iterator(in_edge_its.second)}; + } + + /* MutableGraph concept functions */ + + std::pair + add_edge_impl(vertex_descriptor u, vertex_descriptor v) { + bool added = true; /* we always allow parallel edges */ + edge_node *e = new edge_node(new_serial()); + e->source = u.raw(); + e->target = v.raw(); + e->props.index = next_edge_index++; + + u.raw()->out_edge_list.push_back(*e); + v.raw()->in_edge_list.push_back(*e); + + graph_edge_count++; + return {edge_descriptor(e), added}; + } + + void remove_edge_impl(edge_descriptor e) { + graph_edge_count--; + + vertex_node *u = e.raw()->source; + vertex_node *v = e.raw()->target; + + v->in_edge_list.erase(v->in_edge_list.iterator_to(*e.raw())); + u->out_edge_list.erase(u->out_edge_list.iterator_to(*e.raw())); + + delete e.raw(); + } + + template + void remove_out_edge_if_impl(vertex_descriptor v, Predicate pred) { + out_edge_iterator it, ite; + std::tie(it, ite) = out_edges_impl(v); + while (it != ite) { + auto jt = it; + ++it; + if (pred(*jt)) { + this->remove_edge_impl(*jt); + } + } + } + + template + void remove_in_edge_if_impl(vertex_descriptor v, Predicate pred) { + in_edge_iterator it, ite; + std::tie(it, ite) = in_edges_impl(v); + while (it != ite) { + auto jt = it; + ++it; + if (pred(*jt)) { + remove_edge_impl(*jt); + } + } + } + + template + void remove_edge_if_impl(Predicate pred) { + edge_iterator it, ite; + std::tie(it, ite) = edges_impl(); + while (it != ite) { + auto jt = it; + ++it; + if (pred(*jt)) { + remove_edge_impl(*jt); + } + } + } + +private: + /* GCC 4.8 has bugs with lambdas in templated friend functions, so: */ + struct source_match { + explicit source_match(const vertex_descriptor &uu) : u(uu) { } + bool operator()(edge_descriptor e) const { return source_impl(e) == u; } + const vertex_descriptor &u; + }; + + struct target_match { + explicit target_match(const vertex_descriptor &vv) : v(vv) { } + bool operator()(edge_descriptor e) const { return target_impl(e) == v; } + const vertex_descriptor &v; + }; +public: + /* Note: (u,v) variant needs to remove all (parallel) edges between (u,v). + * + * The edge_descriptor version should be strongly preferred if the + * edge_descriptor is available. 
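All three `remove_*_if_impl()` variants above rely on the same iterator discipline: take a copy, advance past the current element, and only then erase through the copy, so removal never invalidates the iterator still in use. A minimal sketch with `std::list`, which shares the per-element iterator stability of an intrusive list:

```cpp
#include <list>

template<typename Pred>
void remove_if_stable(std::list<int> &l, Pred pred) {
    auto it = l.begin(), ite = l.end();
    while (it != ite) {
        auto jt = it; // keep a handle on the current element
        ++it;         // step off it before any erase happens
        if (pred(*jt)) {
            l.erase(jt);
        }
    }
}

int main() {
    std::list<int> l{1, 2, 3, 4};
    remove_if_stable(l, [](int x) { return x % 2 == 0; });
    return l.size() == 2 ? 0 : 1;
}
```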
+ */ + void remove_edge_impl(const vertex_descriptor &u, + const vertex_descriptor &v) { + if (in_degree_impl(v) < out_degree_impl(u)) { + remove_in_edge_if_impl(v, source_match(u)); + } else { + remove_out_edge_if_impl(u, target_match(v)); + } + } + + /* VertexListGraph concept functions */ + vertices_size_type num_vertices_impl() const { + return vertices_list.size(); + } + + std::pair vertices_impl() const { + return {vertex_iterator(vertices_list.begin()), + vertex_iterator(vertices_list.end())}; + } + + /* EdgeListGraph concept functions (aside from those in IncidenceGraph) */ + + edges_size_type num_edges_impl() const { + return graph_edge_count; + } + + std::pair edges_impl() const { + vertex_iterator vi, ve; + std::tie(vi, ve) = vertices_impl(); + + return {edge_iterator(vi, ve), edge_iterator(ve, ve)}; + } + + /* bundled properties functions */ + + vertex_property_type &operator[](vertex_descriptor v) { + return v.raw()->props; + } + + const vertex_property_type &operator[](vertex_descriptor v) const { + return v.raw()->props; + } + + edge_property_type &operator[](edge_descriptor e) { + return e.raw()->props; + } + + const edge_property_type &operator[](edge_descriptor e) const { + return e.raw()->props; + } + + /* PropertyGraph concept functions & helpers */ + + template + struct prop_map : public boost::put_get_helper > { + using value_type = typename std::decay::type; + using reference = R; + using key_type = typename bundle_key_type::type; + + typedef typename boost::lvalue_property_map_tag category; + + prop_map(value_type P_of::*m_in) : member(m_in) { } + + reference operator[](key_type k) const { + return k.raw()->props.*member; + } + reference operator()(key_type k) const { return (*this)[k]; } + + private: + value_type P_of::*member; + }; + + template + struct prop_map_all : public boost::put_get_helper > { + using value_type = typename std::decay::type; + using reference = R; + using key_type = typename bundle_key_type::type; + + typedef typename boost::lvalue_property_map_tag category; + + reference operator[](key_type k) const { + return k.raw()->props; + } + reference operator()(key_type k) const { return (*this)[k]; } + }; + + template + friend + prop_map get(P_type P_of::*t, Graph &) { + return prop_map(t); + } + + template + friend + prop_map get(P_type P_of::*t, const Graph &) { + return prop_map(t); + } + + /* We can't seem to use auto/decltype returns here as it seems that the + * templated member functions are not yet visible when the compile is + * evaluating the decltype for the return value. We could probably work + * around it by making this a dummy templated function. 
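The `prop_map` machinery that follows boils down to a property map built from a pointer-to-member. Stripped of the BGL scaffolding, and with hypothetical `props`/`desc` types, the core mechanism is:

```cpp
#include <cstddef>

struct props { std::size_t index; };
struct desc { props *p; }; // stand-in for a vertex/edge descriptor

template<typename T>
struct member_map {
    T props::*member;
    // reading or writing through the map dereferences the member pointer
    // on the descriptor's bundled properties
    T &operator[](desc d) const { return d.p->*member; }
};

template<typename T>
member_map<T> make_map(T props::*m) { return member_map<T>{m}; }

int main() {
    props pr{7};
    desc d{&pr};
    auto pm = make_map(&props::index); // analogous to get(&Props::index, g)
    pm[d] = 42;                        // writes pr.index
    return pr.index == 42 ? 0 : 1;
}
```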
*/ + friend + prop_map + get(boost::vertex_index_t, Graph &g) { + return get(&VertexPropertyType::index, g); + } + + friend + prop_map + get(boost::vertex_index_t, const Graph &g) { + return get(&VertexPropertyType::index, g); + } + + friend + prop_map + get(boost::edge_index_t, Graph &g) { + return get(&EdgePropertyType::index, g); + } + + friend + prop_map + get(boost::edge_index_t, const Graph &g) { + return get(&EdgePropertyType::index, g); + } + + friend + prop_map_all get(boost::vertex_all_t, Graph &) { + return {}; + } + + friend + prop_map_all get(boost::vertex_all_t, + const Graph &) { + return {}; + } + + friend + prop_map_all get(boost::edge_all_t, Graph &) { + return {}; + } + + friend + prop_map_all get(boost::edge_all_t, + const Graph &) { + return {}; + } + + friend + prop_map_all get(boost::vertex_bundle_t, Graph &) { + return {}; + } + + friend + prop_map_all get(boost::vertex_bundle_t, + const Graph &) { + return {}; + } + + friend + prop_map_all get(boost::edge_bundle_t, Graph &) { + return {}; + } + + friend + prop_map_all get(boost::edge_bundle_t, + const Graph &) { + return {}; + } + + template + friend + auto get(Prop p, Graph &g, K key) -> decltype(get(p, g)[key]) { + return get(p, g)[key]; + } + + template + friend + auto get(Prop p, const Graph &g, K key) -> decltype(get(p, g)[key]) { + return get(p, g)[key]; + } + + template + friend + void put(Prop p, Graph &g, K key, const V &value) { + get(p, g)[key] = value; + } + + /* MutablePropertyGraph concept functions */ + + /* Note: add_vertex(g, vp) allocates a next index value for the vertex + * rather than using the index in vp. i.e., except for in rare coincidences: + * g[add_vertex(g, vp)].index != vp.index + */ + vertex_descriptor add_vertex_impl(const VertexPropertyType &vp) { + vertex_descriptor v = add_vertex_impl(); + auto i = (*this)[v].index; + (*this)[v] = vp; + (*this)[v].index = i; + + return v; + } + + /* Note: add_edge(u, v, g, vp) allocates a next index value for the edge + * rather than using the index in ep. i.e., except for in rare coincidences: + * g[add_edge(u, v, g, ep)].index != ep.index + */ + std::pair + add_edge_impl(vertex_descriptor u, vertex_descriptor v, + const EdgePropertyType &ep) { + auto e = add_edge_impl(u, v); + auto i = (*this)[e.first].index; + (*this)[e.first] = ep; + (*this)[e.first].index = i; + + return e; + } + + /* End MutablePropertyGraph */ + + /** Pack the edge index into a contiguous range [ 0, num_edges(g) ). */ + void renumber_edges_impl() { + next_edge_index = 0; + edge_iterator it; + edge_iterator ite; + for (std::tie(it, ite) = edges_impl(); it != ite; ++it) { + (*this)[*it].index = next_edge_index++; + } + } + + /** Pack the vertex index into a contiguous range [ 0, num_vertices(g) ). + * Vertices with indices less than N_SPECIAL_VERTICES are not renumbered. + */ + void renumber_vertices_impl() { + DEBUG_PRINTF("renumbering above %zu\n", Graph::N_SPECIAL_VERTICES); + next_vertex_index = Graph::N_SPECIAL_VERTICES; + vertex_iterator it; + vertex_iterator ite; + for (std::tie(it, ite) = vertices_impl(); it != ite; ++it) { + if ((*this)[*it].index < Graph::N_SPECIAL_VERTICES) { + continue; + } + + (*this)[*it].index = next_vertex_index++; + } + } + + /** Returns what the next allocated vertex index will be. This is an upper + * on the values of index for vertices (vertex removal means that there may + * be gaps). */ + vertices_size_type vertex_index_upper_bound_impl() const { + return next_vertex_index; + } + + /** Returns what the next allocated edge index will be. 
This is an upper on + * the values of index for edges (edge removal means that there may be + * gaps). */ + vertices_size_type edge_index_upper_bound_impl() const { + return next_edge_index; + } + + using directed_category = boost::directed_tag; + using edge_parallel_category = boost::allow_parallel_edge_tag; + struct traversal_category : + public virtual boost::bidirectional_graph_tag, + public virtual boost::adjacency_graph_tag, + public virtual boost::vertex_list_graph_tag, + public virtual boost::edge_list_graph_tag { }; + + ue2_graph() = default; + + ue2_graph(ue2_graph &&old) + : next_vertex_index(old.next_vertex_index), + next_edge_index(old.next_edge_index), + graph_edge_count(old.graph_edge_count), + next_serial(old.next_serial) { + using std::swap; + swap(vertices_list, old.vertices_list); + } + + ue2_graph &operator=(ue2_graph &&old) { + next_vertex_index = old.next_vertex_index; + next_edge_index = old.next_edge_index; + graph_edge_count = old.graph_edge_count; + next_serial = old.next_serial; + using std::swap; + swap(vertices_list, old.vertices_list); + return *this; + } + + ~ue2_graph() { + vertices_list.clear_and_dispose(delete_disposer()); + } +}; + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertex_descriptor>::type +add_vertex(Graph &g) { + return g.add_vertex_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_vertex(typename Graph::vertex_descriptor v, Graph &g) { + g.remove_vertex_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +clear_in_edges(typename Graph::vertex_descriptor v, Graph &g) { + g.clear_in_edges_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +clear_out_edges(typename Graph::vertex_descriptor v, Graph &g) { + g.clear_out_edges_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +clear_vertex(typename Graph::vertex_descriptor v, Graph &g) { + g.clear_in_edges_impl(v); + g.clear_out_edges_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertex_descriptor>::type +source(typename Graph::edge_descriptor e, const Graph &) { + return Graph::source_impl(e); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertex_descriptor>::type +target(typename Graph::edge_descriptor e, const Graph &) { + return Graph::target_impl(e); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::degree_size_type>::type +out_degree(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::out_degree_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +out_edges(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::out_edges_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::degree_size_type>::type +in_degree(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::in_degree_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +in_edges(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::in_edges_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::degree_size_type>::type +degree(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::degree_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +adjacent_vertices(typename 
Graph::vertex_descriptor v, const Graph &) { + return Graph::adjacent_vertices_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +edge(typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v, + const Graph &g) { + return g.edge_impl(u, v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +inv_adjacent_vertices(typename Graph::vertex_descriptor v, const Graph &) { + return Graph::inv_adjacent_vertices_impl(v); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +add_edge(typename Graph::vertex_descriptor u, + typename Graph::vertex_descriptor v, Graph &g) { + return g.add_edge_impl(u, v); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_edge(typename Graph::edge_descriptor e, Graph &g) { + g.remove_edge_impl(e); +} + +template +typename std::enable_if< + !std::is_convertible::value + && std::is_base_of::value>::type +remove_edge(Iter it, Graph &g) { + g.remove_edge_impl(*it); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_out_edge_if(typename Graph::vertex_descriptor v, Predicate pred, + Graph &g) { + g.remove_out_edge_if_impl(v, pred); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_in_edge_if(typename Graph::vertex_descriptor v, Predicate pred, + Graph &g) { + g.remove_in_edge_if_impl(v, pred); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_edge_if(Predicate pred, Graph &g) { + g.remove_edge_if_impl(pred); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +remove_edge(const typename Graph::vertex_descriptor &u, + const typename Graph::vertex_descriptor &v, Graph &g) { + g.remove_edge_impl(u, v); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertices_size_type>::type +num_vertices(const Graph &g) { + return g.num_vertices_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +vertices(const Graph &g) { + return g.vertices_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::edges_size_type>::type +num_edges(const Graph &g) { + return g.num_edges_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +edges(const Graph &g) { + return g.edges_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertex_descriptor>::type +add_vertex(const typename Graph::vertex_property_type &vp, Graph &g) { + return g.add_vertex_impl(vp); +} + +template +typename std::enable_if< + std::is_base_of::value, + std::pair>::type +add_edge(typename Graph::vertex_descriptor u, + typename Graph::vertex_descriptor v, + const typename Graph::edge_property_type &ep, Graph &g) { + return g.add_edge_impl(u, v, ep); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +renumber_edges(Graph &g) { + g.renumber_edges_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value>::type +renumber_vertices(Graph &g) { + g.renumber_vertices_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::vertices_size_type>::type +vertex_index_upper_bound(const Graph &g) { + return g.vertex_index_upper_bound_impl(); +} + +template +typename std::enable_if< + std::is_base_of::value, + typename Graph::edges_size_type>::type +edge_index_upper_bound(const Graph &g) { + return 
g.edge_index_upper_bound_impl(); +} + +using boost::vertex_index; +using boost::edge_index; + +} + +namespace boost { + +/* Install partial specialisation of property_map - this is required for + * adaptors (like filtered_graph) to know the type of the property maps */ +template +struct property_map::value + >::type > { + typedef decltype(get(std::declval(), + std::declval())) type; + typedef decltype(get(std::declval(), + std::declval())) const_type; +}; + +} +#endif diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 3c7be473..08b6a544 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -55,6 +55,29 @@ size_t maxStringSelfOverlap(const std::string &a, bool nocase); /// Compares two strings, returns non-zero if they're different. u32 cmp(const char *a, const char *b, size_t len, bool nocase); +/** + * \brief String type that also records whether the whole string is caseful or + * caseless. + * + * You should use \ref ue2_literal if you need to represent a mixed-case + * literal. + */ +struct ue2_case_string { + ue2_case_string(std::string s_in, bool nocase_in) + : s(std::move(s_in)), nocase(nocase_in) { + if (nocase) { + upperString(s); + } + } + + bool operator==(const ue2_case_string &other) const { + return s == other.s && nocase == other.nocase; + } + + std::string s; + bool nocase; +}; + struct ue2_literal { public: /// Single element proxy, pointed to by our const_iterator. diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h index 0619c7e4..3385e441 100644 --- a/src/util/uniform_ops.h +++ b/src/util/uniform_ops.h @@ -180,44 +180,52 @@ #define partial_load_m384(ptr, sz) loadbytes384(ptr, sz) #define partial_load_m512(ptr, sz) loadbytes512(ptr, sz) -#define store_compressed_u32(ptr, x, m) storecompressed32(ptr, x, m) -#define store_compressed_u64a(ptr, x, m) storecompressed64(ptr, x, m) -#define store_compressed_m128(ptr, x, m) storecompressed128(ptr, x, m) -#define store_compressed_m256(ptr, x, m) storecompressed256(ptr, x, m) -#define store_compressed_m384(ptr, x, m) storecompressed384(ptr, x, m) -#define store_compressed_m512(ptr, x, m) storecompressed512(ptr, x, m) +#define store_compressed_u32(ptr, x, m, len) storecompressed32(ptr, x, m, len) +#define store_compressed_u64a(ptr, x, m, len) storecompressed64(ptr, x, m, len) +#define store_compressed_m128(ptr, x, m, len) storecompressed128(ptr, x, m, len) +#define store_compressed_m256(ptr, x, m, len) storecompressed256(ptr, x, m, len) +#define store_compressed_m384(ptr, x, m, len) storecompressed384(ptr, x, m, len) +#define store_compressed_m512(ptr, x, m, len) storecompressed512(ptr, x, m, len) -#define load_compressed_u32(x, ptr, m) loadcompressed32(x, ptr, m) -#define load_compressed_u64a(x, ptr, m) loadcompressed64(x, ptr, m) -#define load_compressed_m128(x, ptr, m) loadcompressed128(x, ptr, m) -#define load_compressed_m256(x, ptr, m) loadcompressed256(x, ptr, m) -#define load_compressed_m384(x, ptr, m) loadcompressed384(x, ptr, m) -#define load_compressed_m512(x, ptr, m) loadcompressed512(x, ptr, m) +#define load_compressed_u32(x, ptr, m, len) loadcompressed32(x, ptr, m, len) +#define load_compressed_u64a(x, ptr, m, len) loadcompressed64(x, ptr, m, len) +#define load_compressed_m128(x, ptr, m, len) loadcompressed128(x, ptr, m, len) +#define load_compressed_m256(x, ptr, m, len) loadcompressed256(x, ptr, m, len) +#define load_compressed_m384(x, ptr, m, len) loadcompressed384(x, ptr, m, len) +#define load_compressed_m512(x, ptr, m, len) loadcompressed512(x, ptr, m, len) -static really_inline void 
clearbit_u32(u32 *p, u32 n) { +static really_inline +void clearbit_u32(u32 *p, u32 n) { assert(n < sizeof(*p) * 8); *p &= ~(1U << n); } -static really_inline void clearbit_u64a(u64a *p, u32 n) { + +static really_inline +void clearbit_u64a(u64a *p, u32 n) { assert(n < sizeof(*p) * 8); *p &= ~(1ULL << n); } + #define clearbit_m128(ptr, n) (clearbit128(ptr, n)) #define clearbit_m256(ptr, n) (clearbit256(ptr, n)) #define clearbit_m384(ptr, n) (clearbit384(ptr, n)) #define clearbit_m512(ptr, n) (clearbit512(ptr, n)) -static really_inline char testbit_u32(const u32 *p, u32 n) { - assert(n < sizeof(*p) * 8); - return !!(*p & (1U << n)); +static really_inline +char testbit_u32(u32 val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1U << n)); } -static really_inline char testbit_u64a(const u64a *p, u32 n) { - assert(n < sizeof(*p) * 8); - return !!(*p & (1ULL << n)); + +static really_inline +char testbit_u64a(u64a val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1ULL << n)); } -#define testbit_m128(ptr, n) (testbit128(ptr, n)) -#define testbit_m256(ptr, n) (testbit256(ptr, n)) -#define testbit_m384(ptr, n) (testbit384(ptr, n)) -#define testbit_m512(ptr, n) (testbit512(ptr, n)) + +#define testbit_m128(val, n) (testbit128(val, n)) +#define testbit_m256(val, n) (testbit256(val, n)) +#define testbit_m384(val, n) (testbit384(val, n)) +#define testbit_m512(val, n) (testbit512(val, n)) #endif diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 00000000..049fd368 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,19 @@ +find_package(Threads) + +# remove some warnings +if(CMAKE_CXX_FLAGS MATCHES "-Wmissing-declarations" ) + string(REPLACE "-Wmissing-declarations" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +include_directories(${PROJECT_SOURCE_DIR}/util) + +# add any subdir with a cmake file +file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) +foreach(e ${dirents}) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND + EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt) + add_subdirectory(${e}) + endif () +endforeach () diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt new file mode 100644 index 00000000..25a833d0 --- /dev/null +++ b/tools/hsbench/CMakeLists.txt @@ -0,0 +1,36 @@ +include (${CMAKE_MODULE_PATH}/sqlite3.cmake) + +if (NOT XCODE) + include_directories(SYSTEM ${SQLITE3_INCLUDE_DIRS}) +else() + # cmake doesn't think Xcode supports isystem + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${SQLITE3_INCLUDE_DIRS}") +endif() + +CHECK_FUNCTION_EXISTS(malloc_info HAVE_MALLOC_INFO) +CHECK_FUNCTION_EXISTS(shmget HAVE_SHMGET) +set(HAVE_SHMGET ${HAVE_SHMGET} CACHE BOOL "shmget()") + +# only set these after all tests are done +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") + + +SET(hsbench_SOURCES + common.h + data_corpus.cpp + data_corpus.h + engine_hyperscan.cpp + engine_hyperscan.h + heapstats.cpp + heapstats.h + huge.cpp + huge.h + main.cpp + thread_barrier.h + timer.h +) + +add_executable(hsbench ${hsbench_SOURCES}) +target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} + ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/hsbench/README.md b/tools/hsbench/README.md new file mode 100644 index 00000000..344a6c00 --- /dev/null +++ b/tools/hsbench/README.md @@ -0,0 +1,8 @@ +Hyperscan Benchmarker: hsbench 
+============================== + +The `hsbench` tool provides an easy way to measure Hyperscan's performance +for a particular set of patterns and corpus of data to be scanned. + +Documentation describing its operation is available in the Tools section of the +[Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/). diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h new file mode 100644 index 00000000..a4d60021 --- /dev/null +++ b/tools/hsbench/common.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMMON_H +#define COMMON_H + +#include + +enum class ScanMode { BLOCK, STREAMING, VECTORED }; + +extern bool echo_matches; +extern bool saveDatabases; +extern bool loadDatabases; +extern std::string serializePath; +extern unsigned int somPrecisionMode; + +#endif // COMMON_H diff --git a/tools/hsbench/data_corpus.cpp b/tools/hsbench/data_corpus.cpp new file mode 100644 index 00000000..55bfe93a --- /dev/null +++ b/tools/hsbench/data_corpus.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "data_corpus.h" + +#include "util/container.h" +#include "ue2common.h" + +#include +#include +#include +#include +#include + +#include + +using namespace std; +using namespace ue2; + +static +void readRow(sqlite3_stmt *statement, vector &blocks, + map &stream_indices) { + unsigned int id = sqlite3_column_int(statement, 0); + unsigned int stream_id = sqlite3_column_int(statement, 1); + const char *blob = (const char *)sqlite3_column_blob(statement, 2); + unsigned int bytes = sqlite3_column_bytes(statement, 2); + + if (!contains(stream_indices, stream_id)) { + unsigned int internal_stream_index = stream_indices.size(); + stream_indices[stream_id] = internal_stream_index; + } + auto internal_stream_index = stream_indices[stream_id]; + + assert(blob || bytes > 0); + blocks.emplace_back(id, stream_id, internal_stream_index, + string(blob, blob + bytes)); +} + +vector readCorpus(const string &filename) { + int status; + sqlite3 *db = nullptr; + + status = sqlite3_open_v2(filename.c_str(), &db, SQLITE_OPEN_READONLY, + nullptr); + + assert(db); + if (status != SQLITE_OK) { + ostringstream err; + err << "Unable to open database '" << filename << "': " + << sqlite3_errmsg(db); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + throw DataCorpusError(err.str()); + } + + static const string query("SELECT id, stream_id, data " + "FROM chunk ORDER BY id;"); + + sqlite3_stmt *statement = nullptr; + + status = sqlite3_prepare_v2(db, query.c_str(), query.size(), &statement, + nullptr); + if (status != SQLITE_OK) { + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + ostringstream oss; + oss << "Query failed: " << query; + throw DataCorpusError(oss.str()); + } + + vector blocks; + map stream_indices; + + status = sqlite3_step(statement); + while (status == SQLITE_ROW) { + readRow(statement, blocks, stream_indices); + status = sqlite3_step(statement); + } + + if (status != SQLITE_DONE) { + ostringstream oss; + oss << "Error retrieving blocks from corpus: " + << sqlite3_errstr(status); + + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + throw DataCorpusError(oss.str()); + } + + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + if (blocks.empty()) { + throw DataCorpusError("Database contains no blocks."); + } + + return blocks; +} diff --git a/tools/hsbench/data_corpus.h b/tools/hsbench/data_corpus.h new file mode 100644 index 00000000..91a87acc --- /dev/null +++ b/tools/hsbench/data_corpus.h @@ -0,0 +1,63 @@ +/* + * 
Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DATACORPUS_H +#define DATACORPUS_H + +#include +#include + +class DataBlock { +public: + DataBlock(unsigned int in_id, unsigned int in_stream, + unsigned int int_stream_index_in, std::string in_data) + : id(in_id), stream_id(in_stream), + internal_stream_index(int_stream_index_in), + payload(std::move(in_data)) {} + + unsigned int id; // unique block identifier + unsigned int stream_id; // unique stream identifier (from corpus file) + unsigned int internal_stream_index; /* dense index for this stream + * (allocated by hsbench) */ + std::string payload; // actual block payload +}; + +/** Exception thrown if an error occurs. */ +class DataCorpusError { +public: + explicit DataCorpusError(std::string msg_in) : msg(std::move(msg_in)) {} + std::string msg; +}; + +/** + * Interface to a corpus database. Any error will produce a DataCorpusError + * and should be considered fatal. + */ +std::vector readCorpus(const std::string &filename); + +#endif // DATACORPUS_H diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp new file mode 100644 index 00000000..f5abb9fa --- /dev/null +++ b/tools/hsbench/engine_hyperscan.cpp @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ExpressionParser.h" +#include "common.h" +#include "engine_hyperscan.h" +#include "expressions.h" +#include "heapstats.h" +#include "huge.h" +#include "timer.h" + +#include "crc32.h" +#include "database.h" +#include "hs_compile.h" +#include "hs_internal.h" +#include "hs_runtime.h" +#include "util/database_util.h" +#include "util/make_unique.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +EngineContext::EngineContext(const hs_database_t *db) { + hs_alloc_scratch(db, &scratch); + assert(scratch); +} + +EngineContext::~EngineContext() { + hs_free_scratch(scratch); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. */ +struct ScanContext { + ScanContext(unsigned id_in, ResultEntry &result_in, + const EngineStream *stream_in) + : id(id_in), result(result_in), stream(stream_in) {} + unsigned id; + ResultEntry &result; + const EngineStream *stream; // nullptr except in streaming mode. +}; + +} // namespace + +/** + * Callback function called for every match that Hyperscan produces, used when + * "echo matches" is off. + */ +static +int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, + void *ctx) { + ScanContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Callback function called for every match that Hyperscan produces when "echo + * matches" is enabled. + */ +static +int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, + unsigned int, void *ctx) { + ScanContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + if (sc->stream) { + printf("Match @%u:%u:%llu for %u\n", sc->stream->sn, sc->id, to, id); + } else { + printf("Match @%u:%llu for %u\n", sc->id, to, id); + } + + return 0; +} + +EngineHyperscan::EngineHyperscan(hs_database_t *db_in) : db(db_in) { + assert(db); +} + +EngineHyperscan::~EngineHyperscan() { + release_huge(db); +} + +unique_ptr EngineHyperscan::makeContext() const { + return ue2::make_unique(db); +} + +void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ctx) const { + assert(data); + + ScanContext sc(id, result, nullptr); + auto callback = echo_matches ? 
onMatchEcho : onMatch; + hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan returned error %d\n", rv); + abort(); + } +} + +void EngineHyperscan::scan_vectored(const char *const *data, + const unsigned int *len, unsigned int count, + unsigned streamId, ResultEntry &result, + EngineContext &ctx) const { + assert(data); + assert(len); + + ScanContext sc(streamId, result, nullptr); + auto callback = echo_matches ? onMatchEcho : onMatch; + hs_error_t rv = + hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan_vector returned error %d\n", rv); + abort(); + } +} + +unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, + unsigned streamId) const { + auto stream = ue2::make_unique(); + stream->ctx = &ctx; + + hs_open_stream(db, 0, &stream->id); + if (!stream->id) { + // an error occurred, propagate to caller + return nullptr; + } + stream->sn = streamId; + return stream; +} + +void EngineHyperscan::streamClose(unique_ptr stream, + ResultEntry &result) const { + assert(stream); + + auto &s = static_cast(*stream); + EngineContext &ctx = *s.ctx; + + ScanContext sc(0, result, &s); + auto callback = echo_matches ? onMatchEcho : onMatch; + + assert(s.id); + hs_close_stream(s.id, ctx.scratch, callback, &sc); + s.id = nullptr; +} + +void EngineHyperscan::streamScan(EngineStream &stream, const char *data, + unsigned len, unsigned id, + ResultEntry &result) const { + assert(data); + + auto &s = static_cast(stream); + EngineContext &ctx = *s.ctx; + + ScanContext sc(id, result, &s); + auto callback = echo_matches ? onMatchEcho : onMatch; + hs_error_t rv = + hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan_stream returned error %d\n", rv); + abort(); + } +} + +static +unsigned makeModeFlags(ScanMode scan_mode) { + switch (scan_mode) { + case ScanMode::BLOCK: + return HS_MODE_BLOCK; + case ScanMode::STREAMING: + return HS_MODE_STREAM; + case ScanMode::VECTORED: + return HS_MODE_VECTORED; + } + assert(0); + return HS_MODE_STREAM; +} + +/** + * Hash the settings used to compile a database, returning a string that can be + * used as a filename. 
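For context, the `streamOpen`/`streamScan`/`streamClose` wrappers above follow the standard Hyperscan streaming lifecycle. A minimal end-to-end example using only the public API (error handling trimmed; the pattern and input are purely illustrative):

```cpp
#include <hs.h>
#include <cstdio>

static int on_match(unsigned id, unsigned long long, unsigned long long to,
                    unsigned, void *) {
    printf("match for %u ending at %llu\n", id, to);
    return 0; // non-zero would halt scanning
}

int main() {
    hs_database_t *db = nullptr;
    hs_compile_error_t *err = nullptr;
    if (hs_compile("foo.*bar", 0, HS_MODE_STREAM, nullptr, &db, &err)
        != HS_SUCCESS) {
        hs_free_compile_error(err);
        return 1;
    }
    hs_scratch_t *scratch = nullptr;
    hs_alloc_scratch(db, &scratch);

    hs_stream_t *stream = nullptr;
    hs_open_stream(db, 0, &stream);
    hs_scan_stream(stream, "foo", 3, 0, scratch, on_match, nullptr);
    // the match straddles the two blocks; stream state carries it across
    hs_scan_stream(stream, "obar", 4, 0, scratch, on_match, nullptr);
    hs_close_stream(stream, scratch, on_match, nullptr);

    hs_free_scratch(scratch);
    hs_free_database(db);
    return 0;
}
```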
+ */ +static +string dbSettingsHash(const string &filename, u32 mode) { + ostringstream info_oss; + + info_oss << filename.c_str() << ' '; + info_oss << mode << ' '; + + string info = info_oss.str(); + + u32 crc = Crc32c_ComputeBuf(0, info.data(), info.size()); + + // return STL string with printable version of digest + ostringstream oss; + oss << hex << setw(8) << setfill('0') << crc << dec; + + return oss.str(); +} + +static +string dbFilename(const std::string &name, unsigned mode) { + ostringstream oss; + oss << serializePath << '/' << dbSettingsHash(name, mode) << ".db"; + return oss.str(); +} + +std::unique_ptr +buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, + const std::string &name, UNUSED const ue2::Grey &grey) { + if (expressions.empty()) { + assert(0); + return nullptr; + } + + long double compileSecs = 0.0; + size_t compiledSize = 0.0; + size_t streamSize = 0; + size_t scratchSize = 0; + unsigned int peakMemorySize = 0; + unsigned int crc = 0; + std::string db_info; + + unsigned int mode = makeModeFlags(scan_mode); + + hs_database_t *db; + hs_error_t err; + + if (loadDatabases) { + db = loadDatabase(dbFilename(name, mode).c_str()); + if (!db) { + return nullptr; + } + } else { + const unsigned int count = expressions.size(); + + vector exprs; + vector flags, ids; + vector ext; + + for (const auto &m : expressions) { + string expr; + unsigned int f = 0; + hs_expr_ext extparam; + extparam.flags = 0; + if (!readExpression(m.second, expr, &f, &extparam)) { + printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(), + m.first); + return nullptr; + } + + exprs.push_back(expr); + ids.push_back(m.first); + flags.push_back(f); + ext.push_back(extparam); + } + + unsigned full_mode = mode; + if (mode == HS_MODE_STREAM) { + full_mode |= somPrecisionMode; + } + + // Our compiler takes an array of plain ol' C strings. + vector patterns(count); + for (unsigned int i = 0; i < count; i++) { + patterns[i] = exprs[i].c_str(); + } + + // Extended parameters are passed as pointers to hs_expr_ext structures. 
+ vector ext_ptr(count); + for (unsigned int i = 0; i < count; i++) { + ext_ptr[i] = &ext[i]; + } + + Timer timer; + timer.start(); + + hs_compile_error_t *compile_err; + +#ifndef RELEASE_BUILD + err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), + ext_ptr.data(), count, full_mode, nullptr, + &db, &compile_err, grey); +#else + err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.data(), + ext_ptr.data(), count, full_mode, nullptr, + &db, &compile_err); +#endif + + timer.complete(); + compileSecs = timer.seconds(); + peakMemorySize = getPeakHeap(); + + if (err == HS_COMPILER_ERROR) { + if (compile_err->expression >= 0) { + printf("Compile error for signature #%u: %s\n", + compile_err->expression, compile_err->message); + } else { + printf("Compile error: %s\n", compile_err->message); + } + hs_free_compile_error(compile_err); + return nullptr; + } + } + + // copy the db into huge pages (where available) to reduce TLB pressure + db = get_huge(db); + if (!db) { + return nullptr; + } + + err = hs_database_size(db, &compiledSize); + if (err != HS_SUCCESS) { + return nullptr; + } + assert(compiledSize > 0); + + crc = db->crc32; + + if (saveDatabases) { + saveDatabase(db, dbFilename(name, mode).c_str()); + } + + if (mode & HS_MODE_STREAM) { + err = hs_stream_size(db, &streamSize); + if (err != HS_SUCCESS) { + return nullptr; + } + } else { + streamSize = 0; + } + + char *info; + err = hs_database_info(db, &info); + if (err != HS_SUCCESS) { + return nullptr; + } else { + db_info = string(info); + free(info); + } + + // Allocate scratch temporarily to find its size: this is a good test + // anyway. + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + if (err != HS_SUCCESS) { + return nullptr; + } + + err = hs_scratch_size(scratch, &scratchSize); + if (err != HS_SUCCESS) { + return nullptr; + } + hs_free_scratch(scratch); + + // Output summary information. + printf("Signatures: %s\n", name.c_str()); + printf("Hyperscan info: %s\n", db_info.c_str()); + printf("Expression count: %'zu\n", expressions.size()); + printf("Bytecode size: %'zu bytes\n", compiledSize); + printf("Database CRC: 0x%x\n", crc); + if (mode & HS_MODE_STREAM) { + printf("Stream state size: %'zu bytes\n", streamSize); + } + printf("Scratch size: %'zu bytes\n", scratchSize); + printf("Compile time: %'0.3Lf seconds\n", compileSecs); + printf("Peak heap usage: %'u bytes\n", peakMemorySize); + + return ue2::make_unique(db); +} diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h new file mode 100644 index 00000000..7875decc --- /dev/null +++ b/tools/hsbench/engine_hyperscan.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ENGINEHYPERSCAN_H
+#define ENGINEHYPERSCAN_H
+
+#include "expressions.h"
+#include "common.h"
+#include "hs_runtime.h"
+
+#include <memory>
+
+/** Structure for the result of a single complete scan. */
+struct ResultEntry {
+    double seconds = 0;       //!< Time taken for scan.
+    unsigned int matches = 0; //!< Count of matches found.
+};
+
+/** Engine context which is allocated on a per-thread basis. */
+class EngineContext {
+public:
+    explicit EngineContext(const hs_database_t *db);
+    ~EngineContext();
+
+    hs_scratch_t *scratch = nullptr;
+};
+
+/** Streaming mode scans have persistent stream state associated with them. */
+class EngineStream {
+public:
+    hs_stream_t *id;
+    unsigned int sn;
+    EngineContext *ctx;
+};
+
+/** Hyperscan Engine for scanning data. */
+class EngineHyperscan {
+public:
+    explicit EngineHyperscan(hs_database_t *db);
+    ~EngineHyperscan();
+
+    std::unique_ptr<EngineContext> makeContext() const;
+
+    void scan(const char *data, unsigned int len, unsigned int id,
+              ResultEntry &result, EngineContext &ctx) const;
+
+    void scan_vectored(const char *const *data, const unsigned int *len,
+                       unsigned int count, unsigned int streamId,
+                       ResultEntry &result, EngineContext &ctx) const;
+
+    std::unique_ptr<EngineStream> streamOpen(EngineContext &ctx,
+                                             unsigned id) const;
+
+    void streamClose(std::unique_ptr<EngineStream> stream,
+                     ResultEntry &result) const;
+
+    void streamScan(EngineStream &stream, const char *data, unsigned int len,
+                    unsigned int id, ResultEntry &result) const;
+
+private:
+    hs_database_t *db;
+};
+
+namespace ue2 {
+struct Grey;
+}
+
+std::unique_ptr<EngineHyperscan>
+buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode,
+                     const std::string &name, const ue2::Grey &grey);
+
+#endif // ENGINEHYPERSCAN_H
diff --git a/tools/hsbench/heapstats.cpp b/tools/hsbench/heapstats.cpp
new file mode 100644
index 00000000..d0dffdb3
--- /dev/null
+++ b/tools/hsbench/heapstats.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Peak heap usage code.
+ *
+ * At present, we only have an implementation for modern glibc systems, using
+ * the malloc_info() call. We return zero elsewhere.
+ */
+
+#include "config.h"
+
+#include "heapstats.h"
+
+#if defined HAVE_MALLOC_INFO
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include <malloc.h>
+
+size_t getPeakHeap(void) {
+    FILE *tmpf = tmpfile();
+    if (!tmpf) {
+        return 0;
+    }
+
+    int rv = malloc_info(0, tmpf);
+    if (rv != 0) {
+        fclose(tmpf);
+        return 0;
+    }
+
+    rewind(tmpf);
+
+    // We don't want to depend on a real XML parser. This is ugly and brittle
+    // and hopefully good enough for the time being. We look for the last
+    // system tag with type max, which should be the malloc-wide one.
+
+    static const char begin[] = "<system type=\"max\" size=\"";
+
+    // Scan every line, remembering the value from the last matching tag.
+    size_t maxheap = 0;
+    char line[1024];
+    while (fgets(line, sizeof(line), tmpf)) {
+        const char *p = strstr(line, begin);
+        if (p) {
+            maxheap = strtoull(p + strlen(begin), nullptr, 10);
+        }
+    }
+
+    fclose(tmpf);
+    return maxheap;
+}
+
+#elif defined(__linux__)
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <sstream>
+
+using namespace std;
+
+size_t getPeakHeap(void) {
+    // Modern Linux kernels write a 'VmPeak' value into /proc/$PID/status. This
+    // is a reasonable approximation, though it likely includes shared libs and
+    // the like as well...
+    ostringstream path;
+    path << "/proc/" << getpid() << "/status";
+
+    ifstream f(path.str().c_str());
+    if (!f.good()) {
+        return 0;
+    }
+
+    const string vmpeak("VmPeak:");
+
+    string line;
+    while (getline(f, line)) {
+        istringstream iss(line, istringstream::in);
+        string word;
+        iss >> word;
+        if (word != vmpeak) {
+            continue;
+        }
+
+        // Skip spaces
+        while (iss.good() && !isdigit(iss.peek())) {
+            iss.ignore();
+        }
+
+        size_t num = 0;
+        iss >> num;
+        return num * 1024;
+    }
+
+    f.close();
+    return 0;
+}
+
+#else
+
+// Stub.
+size_t getPeakHeap(void) {
+    return 0;
+}
+
+#endif
diff --git a/src/util/fatbit.c b/tools/hsbench/heapstats.h
similarity index 91%
rename from src/util/fatbit.c
rename to tools/hsbench/heapstats.h
index a80c3165..c2c37998 100644
--- a/src/util/fatbit.c
+++ b/tools/hsbench/heapstats.h
@@ -26,9 +26,11 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "fatbit.h"
-#include "multibit.h"
+#ifndef HEAPSTATS_H
+#define HEAPSTATS_H
 
-u32 fatbit_size(u32 total_bits) {
-    return MAX(sizeof(struct fatbit), mmbit_size(total_bits));
-}
+#include <stddef.h> // for size_t
+
+size_t getPeakHeap(void);
+
+#endif
diff --git a/tools/hsbench/huge.cpp b/tools/hsbench/huge.cpp
new file mode 100644
index 00000000..dbb453b2
--- /dev/null
+++ b/tools/hsbench/huge.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "hs.h"
+#include "ue2common.h"
+
+#include "common.h"
+#include "huge.h"
+
+#ifndef _WIN32
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#if defined(HAVE_SHMGET)
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+
+UNUSED static int hsdb_shmid;
+
+using namespace std;
+
+long gethugepagesize(void);
+
+hs_database_t *get_huge(hs_database_t *db) {
+#if defined(HAVE_SHMGET) && defined(SHM_HUGETLB)
+    /* move the database to huge pages where possible, but fail politely */
+    hs_error_t err;
+    size_t len;
+    char *bytes;
+
+    long hpage_size = gethugepagesize();
+    if (hpage_size < 0) {
+        printf("Couldn't determine huge page size\n");
+        hsdb_shmid = -1;
+        return db;
+    }
+
+    err = hs_serialize_database(db, &bytes, &len);
+    if (err != HS_SUCCESS) {
+        printf("Failed to serialize database for copy: %d\n", err);
+        // this is weird - don't fail gracefully this time
+        return nullptr;
+    }
+
+    size_t size;
+    err = hs_serialized_database_size(bytes, len, &size);
+    if (err != HS_SUCCESS) {
+        printf("Failed to get database size: %d\n", err);
+        // this is weird - don't fail gracefully this time
+        return nullptr;
+    }
+
+    void *shmaddr;
+    if ((hsdb_shmid = shmget(IPC_PRIVATE, ROUNDUP_N(size, gethugepagesize()),
+                             SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+        // This could fail if the user doesn't have permission to shmget(),
+        // which is OK.
+        goto fini;
+    }
+
+    shmaddr = shmat(hsdb_shmid, nullptr, SHM_RND);
+    if (shmaddr == (char *)-1) {
+        perror("Shared memory attach failure");
+        goto fini;
+    }
+
+    // Mark this segment to be destroyed after this process detaches.
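+    // (IPC_RMID only marks a System V segment for removal: the mapping stays
+    // valid until the final shmdt(), so the database remains usable here and
+    // the kernel reclaims it automatically even on an unclean exit.)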
+    shmctl(hsdb_shmid, IPC_RMID, nullptr);
+
+    err = hs_deserialize_database_at(bytes, len, (hs_database_t *)shmaddr);
+    if (err != HS_SUCCESS) {
+        printf("Failed to deserialize database into shm: %d\n", err);
+        shmdt((const void *)shmaddr);
+        goto fini;
+    }
+
+    free(bytes);
+    hs_free_database(db);
+    return (hs_database_t *)shmaddr;
+
+fini:
+    free(bytes);
+    hsdb_shmid = -1;
+    return db;
+#else
+    return db;
+#endif
+}
+
+void release_huge(hs_database_t *db) {
+#if defined(HAVE_SHMGET) && defined(SHM_HUGETLB)
+    if (hsdb_shmid != -1) {
+        if (shmdt((const void *)db) != 0) {
+            perror("Detach failure");
+        }
+    } else {
+        // fallback
+        hs_free_database(db);
+    }
+#else
+    hs_free_database(db);
+#endif
+}
+
+#define BUF_SIZE 4096
+static long read_meminfo(const char *tag) {
+    int fd;
+    char buf[BUF_SIZE];
+    int len;
+    char *p, *q;
+    long val;
+
+    fd = open("/proc/meminfo", O_RDONLY);
+    if (fd < 0) {
+        perror("Couldn't open /proc/meminfo");
+        return -1;
+    }
+
+    len = read(fd, buf, sizeof(buf));
+    close(fd);
+    if (len < 0) {
+        perror("Error reading /proc/meminfo");
+        return -1;
+    }
+    if (len == sizeof(buf)) {
+        printf("/proc/meminfo is too large\n");
+        return -1;
+    }
+    buf[len] = '\0';
+
+    p = strstr(buf, tag);
+    if (!p) {
+        return -1;
+    }
+
+    p += strlen(tag);
+    val = strtol(p, &q, 0);
+    if (!isspace(*q)) {
+        printf("Couldn't parse /proc/meminfo value\n");
+        return -1;
+    }
+
+    return val;
+}
+
+long gethugepagesize(void) {
+    long hpage_size;
+    int hpage_kb;
+
+    hpage_kb = read_meminfo("Hugepagesize:");
+    if (hpage_kb < 0) {
+        hpage_size = -1;
+    } else {
+        /* convert from kb to bytes */
+        hpage_size = 1024 * hpage_kb;
+    }
+
+    return hpage_size;
+}
+
+#else
+
+/* No huge page support on WIN32. */
+
+hs_database_t *get_huge(hs_database_t *db) { return db; }
+
+void release_huge(hs_database_t *db) { hs_free_database(db); }
+
+#endif
diff --git a/tools/hsbench/huge.h b/tools/hsbench/huge.h
new file mode 100644
index 00000000..da539bd6
--- /dev/null
+++ b/tools/hsbench/huge.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HUGE_H
+#define HUGE_H
+
+#include "hs.h"
+
+hs_database_t *get_huge(hs_database_t *db);
+void release_huge(hs_database_t *db);
+
+#endif /* HUGE_H */
diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp
new file mode 100644
index 00000000..4298963b
--- /dev/null
+++ b/tools/hsbench/main.cpp
@@ -0,0 +1,780 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common.h"
+#include "data_corpus.h"
+#include "engine_hyperscan.h"
+#include "expressions.h"
+#include "thread_barrier.h"
+#include "timer.h"
+#include "util/expression_path.h"
+#include "util/string_util.h"
+
+#include "grey.h"
+#include "hs.h"
+#include "ue2common.h"
+#include "util/make_unique.h"
+
+#include <algorithm>
+#include <clocale>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#ifndef _WIN32
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
+#include <boost/core/noncopyable.hpp>
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using namespace ue2;
+using boost::adaptors::map_keys;
+
+// Globals common to all files.
+bool echo_matches = false;
+bool saveDatabases = false;
+bool loadDatabases = false;
+string serializePath("");
+unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
+
+namespace /* anonymous */ {
+
+// Globals local to this file.
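+// (Configuration below is filled in once by processArgs(); the global timer
+// and totals are only ever written by the first benchmark thread.)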
+bool display_per_scan = false;
+ScanMode scan_mode = ScanMode::STREAMING;
+unsigned repeats = 20;
+string exprPath("");
+string corpusFile("");
+vector<unsigned int> threadCores;
+Timer totalTimer;
+double totalSecs = 0;
+
+typedef void (*thread_func_t)(void *context);
+
+class ThreadContext : boost::noncopyable {
+public:
+    ThreadContext(unsigned num_in, const EngineHyperscan &db_in,
+                  thread_barrier &tb_in, thread_func_t function_in,
+                  vector<DataBlock> corpus_data_in)
+        : num(num_in), results(repeats), engine(db_in),
+          enginectx(db_in.makeContext()), corpus_data(move(corpus_data_in)),
+          tb(tb_in), function(function_in) {}
+
+    // Start the thread.
+    bool start(int cpu) {
+        thr = thread(function, this);
+
+        // affine if it's asked for
+        if (cpu >= 0) {
+            return affine(cpu);
+        }
+        return true;
+    }
+
+    // Wait for the thread to exit.
+    void join() {
+        thr.join();
+    }
+
+    // Serialise all threads on a global barrier.
+    void barrier() {
+        tb.wait();
+    }
+
+    // Apply processor affinity (if available) to this thread.
+    bool affine(UNUSED int cpu) {
+#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+        assert(cpu >= 0 && cpu < CPU_SETSIZE);
+
+        // The 'clang' compiler complains about an unused result here, so we
+        // silence it.
+        (void)CPU_SET(cpu, &cpuset);
+
+        int rv = pthread_setaffinity_np(thr.native_handle(), sizeof(cpuset),
+                                        &cpuset);
+        return (rv == 0);
+#endif
+        return false; // not available
+    }
+
+    unsigned num;
+    Timer timer;
+    vector<ResultEntry> results;
+    const EngineHyperscan &engine;
+    unique_ptr<EngineContext> enginectx;
+    vector<DataBlock> corpus_data;
+
+protected:
+    thread_barrier &tb; // shared barrier for time sync
+    thread_func_t function;
+    thread thr;
+};
+
+/** Display usage information, with an optional error. */
+static
+void usage(const char *error) {
+    printf("Usage: hsbench [OPTIONS...]\n\n");
+    printf("Options:\n\n");
+    printf("  -h              Display help and exit.\n");
+    printf("  -G OVERRIDES    Overrides for the grey box.\n");
+    printf("  -e PATH         Path to expression directory.\n");
+    printf("  -s FILE         Signature file to use.\n");
+    printf("  -z NUM          Signature ID to use.\n");
+    printf("  -c FILE         File to use as corpus.\n");
+    printf("  -n NUMBER       Repeat scan NUMBER times (default 20).\n");
+    printf("  -N              Benchmark in block mode"
+           " (default: streaming).\n");
+    printf("  -V              Benchmark in vectored mode"
+           " (default: streaming).\n");
+    printf("  -T CPU,CPU,...  Benchmark with threads on these CPUs.\n");
+    printf("  -i DIR          Don't compile, load from files in DIR"
+           " instead.\n");
+    printf("  -w DIR          After compiling, save to files in DIR.\n");
+    printf("  -d NUMBER       Set SOM precision mode (default: 8 (large)).\n");
+    printf("\n");
+    printf("  --per-scan      Display per-scan Mbit/sec results.\n");
+    printf("  --echo-matches  Display all matches that occur during scan.\n");
+    printf("\n\n");
+
+    if (error) {
+        printf("Error: %s\n", error);
+    }
+}
+
+/** Wraps up a name and the set of signature IDs it refers to. */
+struct BenchmarkSigs {
+    BenchmarkSigs(string name_in, SignatureSet sigs_in)
+        : name(move(name_in)), sigs(move(sigs_in)) {}
+    string name;
+    SignatureSet sigs;
+};
+
+/**
+ * Process command-line arguments. Prints usage and exits on error.
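+ *
+ * Signature files supplied with -s are collected while parsing and read once
+ * parsing completes, each becoming its own BenchmarkSigs set; a bare -z ID
+ * forms a single-signature set.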
+ */
+static
+void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
+                 UNUSED Grey &grey) {
+    const char options[] = "-b:c:Cd:e:G:hi:n:No:p:sT:Vw:z:";
+    int in_sigfile = 0;
+    int do_per_scan = 0;
+    int do_echo_matches = 0;
+    vector<string> sigFiles;
+
+    static struct option longopts[] = {
+        {"per-scan", 0, &do_per_scan, 1},
+        {"echo-matches", 0, &do_echo_matches, 1},
+        {nullptr, 0, nullptr, 0}
+    };
+
+    for (;;) {
+        int c = getopt_long(argc, argv, options, longopts, nullptr);
+        if (c < 0) {
+            break;
+        }
+        switch (c) {
+        case 'c':
+            corpusFile.assign(optarg);
+            break;
+        case 'd': {
+            unsigned dist;
+            if (!fromString(optarg, dist)) {
+                usage("Must provide an integer argument to '-d' flag");
+                exit(1);
+            }
+            switch (dist) {
+            case 2:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
+                break;
+            case 4:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
+                break;
+            case 8:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
+                break;
+            default:
+                usage("SOM precision must be 2, 4 or 8");
+                exit(1);
+            }
+            break;
+        }
+        case 'e':
+            exprPath.assign(optarg);
+            break;
+#ifndef RELEASE_BUILD
+        case 'G':
+            applyGreyOverrides(&grey, string(optarg));
+            break;
+#endif
+        case 'h':
+            usage(nullptr);
+            exit(0);
+            break;
+        case 'n':
+            if (!fromString(optarg, repeats) || repeats == 0) {
+                usage("Couldn't parse argument to -n flag, should be"
+                      " a positive integer.");
+                exit(1);
+            }
+            break;
+        case 's':
+            in_sigfile = 2;
+            break;
+        case 'N':
+            scan_mode = ScanMode::BLOCK;
+            break;
+        case 'V':
+            scan_mode = ScanMode::VECTORED;
+            break;
+        case 'T':
+            if (!strToList(optarg, threadCores)) {
+                usage("Couldn't parse argument to -T flag, should be"
+                      " a list of positive integers.");
+                exit(1);
+            }
+            break;
+        case 'z': {
+            unsigned int sinumber;
+            if (!fromString(optarg, sinumber)) {
+                usage("Argument to '-z' flag must be an integer");
+                exit(1);
+            }
+            SignatureSet sigs = {sinumber};
+            sigSets.emplace_back(string("-z ") + optarg, sigs);
+            break;
+        }
+        case 'i':
+            loadDatabases = true;
+            serializePath = optarg;
+            break;
+        case 'w':
+            saveDatabases = true;
+            serializePath = optarg;
+            break;
+        case 1:
+            if (in_sigfile) {
+                sigFiles.push_back(optarg);
+                in_sigfile = 2;
+                break;
+            }
+        case 0:
+            break;
+        default:
+            usage("Unrecognised command line argument.");
+            exit(1);
+        }
+
+        if (in_sigfile) {
+            in_sigfile--;
+        }
+    }
+
+    if (do_echo_matches) {
+        echo_matches = true;
+    }
+    if (do_per_scan) {
+        display_per_scan = true;
+    }
+
+    if (exprPath.empty() && !sigFiles.empty()) {
+        /* attempt to infer an expression directory */
+        auto si = sigFiles.begin();
+        exprPath = inferExpressionPath(*si);
+        for (++si; si != sigFiles.end(); ++si) {
+            if (exprPath != inferExpressionPath(*si)) {
+                usage("Unable to infer consistent expression directory");
+                exit(1);
+            }
+        }
+    }
+
+    // Must have a valid expression path
+    if (exprPath.empty()) {
+        usage("Must specify an expression path with the -e option.");
+        exit(1);
+    }
+
+    // Must have valid database to scan
+    if (corpusFile.empty()) {
+        usage("Must specify a corpus file with the -c option.");
+        exit(1);
+    }
+
+    // Cannot ask for both loading and saving
+    if (loadDatabases && saveDatabases) {
+        usage("You cannot both load and save databases.");
+        exit(1);
+    }
+
+    // Read in any -s signature sets.
+    for (const auto &file : sigFiles) {
+        SignatureSet sigs;
+        loadSignatureList(file, sigs);
+        sigSets.emplace_back(file, move(sigs));
+    }
+}
+
+/**
+ * Start the global timer.
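+ *
+ * Only thread 0 touches the global timer; the barrier taken in each bench
+ * function ensures every worker is ready before timing begins.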
+ */
+static
+void startTotalTimer(ThreadContext *ctx) {
+    if (ctx->num != 0) {
+        return; // only runs in the first thread
+    }
+    totalTimer.start();
+}
+
+/** Stop the global timer and calculate totals. */
+static
+void stopTotalTimer(ThreadContext *ctx) {
+    if (ctx->num != 0) {
+        return; // only runs in the first thread
+    }
+    totalTimer.complete();
+    totalSecs = totalTimer.seconds();
+}
+
+/** Run a benchmark over a given engine and corpus in block mode. */
+static
+void benchBlock(void *context) {
+    ThreadContext *ctx = (ThreadContext *)context;
+
+    // Synchronization point
+    ctx->barrier();
+
+    startTotalTimer(ctx);
+
+    for (ResultEntry &r : ctx->results) {
+        ctx->timer.start();
+
+        for (const DataBlock &block : ctx->corpus_data) {
+            ctx->engine.scan(block.payload.c_str(), block.payload.size(),
+                             block.id, r, *ctx->enginectx);
+        }
+
+        ctx->timer.complete();
+        r.seconds = ctx->timer.seconds();
+    }
+
+    // Synchronization point
+    ctx->barrier();
+
+    // Now that all threads are finished, we can stop the clock.
+    stopTotalTimer(ctx);
+}
+
+/** Structure used to represent a stream. */
+struct StreamInfo {
+    unsigned int stream_id = ~0U;
+    unsigned int first_block_id = ~0U;
+    unsigned int last_block_id = 0;
+    unique_ptr<EngineStream> eng_handle;
+};
+
+static
+u64a count_streams(const vector<DataBlock> &corpus_blocks) {
+    set<unsigned int> streams;
+    for (const DataBlock &block : corpus_blocks) {
+        streams.insert(block.stream_id);
+    }
+
+    return (u64a)streams.size();
+}
+
+/**
+ * Take a ThreadContext and prepare a vector<StreamInfo> for streaming mode
+ * scanning from it.
+ */
+static
+vector<StreamInfo> prepStreamingData(const ThreadContext *ctx) {
+    vector<StreamInfo> info(count_streams(ctx->corpus_data));
+    for (const DataBlock &block : ctx->corpus_data) {
+        assert(block.internal_stream_index < info.size());
+        StreamInfo &si = info[block.internal_stream_index];
+
+        /* check if this is the first time we have encountered this stream */
+        if (si.first_block_id > si.last_block_id) {
+            si.stream_id = block.stream_id;
+            si.first_block_id = block.id;
+            si.last_block_id = block.id;
+        } else {
+            assert(block.stream_id == si.stream_id);
+            assert(block.id > si.last_block_id);
+            assert(block.id > si.first_block_id);
+            si.last_block_id = block.id;
+        }
+    }
+    return info;
+}
+
+static
+void benchStreamingInternal(ThreadContext *ctx, vector<StreamInfo> &streams) {
+    assert(ctx);
+    const EngineHyperscan &e = ctx->engine;
+    const vector<DataBlock> &blocks = ctx->corpus_data;
+
+    for (ResultEntry &r : ctx->results) {
+        ctx->timer.start();
+
+        for (const auto &b : blocks) {
+            StreamInfo &stream = streams[b.internal_stream_index];
+            assert(stream.stream_id == b.stream_id);
+
+            // If this is the first block in the stream, open the stream
+            // handle.
+            if (b.id == stream.first_block_id) {
+                assert(!stream.eng_handle);
+                stream.eng_handle = e.streamOpen(*ctx->enginectx, b.stream_id);
+                if (!stream.eng_handle) {
+                    printf("Fatal error: stream open failed!\n");
+                    exit(1);
+                }
+            }
+
+            assert(stream.eng_handle);
+
+            e.streamScan(*stream.eng_handle, b.payload.c_str(),
+                         b.payload.size(), b.id, r);
+
+            // if this was the last block in the stream, close the stream handle
+            if (b.id == stream.last_block_id) {
+                e.streamClose(move(stream.eng_handle), r);
+                stream.eng_handle = nullptr;
+            }
+        }
+
+        ctx->timer.complete();
+        r.seconds = ctx->timer.seconds();
+    }
+}
+
+/**
+ * Run a benchmark over a given engine and corpus in streaming mode.
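+ *
+ * Stream handles are opened lazily when a stream's first block is seen and
+ * closed again after its last block, so the number of concurrently open
+ * streams matches what the corpus interleaving requires.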
+ */
+static
+void benchStreaming(void *context) {
+    ThreadContext *ctx = (ThreadContext *)context;
+    vector<StreamInfo> streams = prepStreamingData(ctx);
+
+    // Synchronization point
+    ctx->barrier();
+
+    startTotalTimer(ctx);
+
+    benchStreamingInternal(ctx, streams);
+
+    // Synchronization point
+    ctx->barrier();
+
+    // Now that all threads are finished, we can stop the clock.
+    stopTotalTimer(ctx);
+}
+
+/** In-memory structure for a data block to be scanned in vectored mode. */
+struct VectoredInfo {
+    vector<const char *> data;
+    vector<unsigned int> len;
+    unsigned int stream_id;
+};
+
+/**
+ * Take a ThreadContext and prepare a vector<VectoredInfo> for vectored mode
+ * scanning from it.
+ */
+static
+vector<VectoredInfo> prepVectorData(const ThreadContext *ctx) {
+    vector<VectoredInfo> out(count_streams(ctx->corpus_data));
+    for (const DataBlock &block : ctx->corpus_data) {
+        VectoredInfo &vi = out[block.internal_stream_index];
+        if (vi.data.empty()) {
+            vi.stream_id = block.stream_id;
+        } else {
+            assert(vi.stream_id == block.stream_id);
+        }
+        vi.data.push_back(block.payload.c_str());
+        vi.len.push_back(block.payload.size());
+    }
+
+    return out;
+}
+
+/** Run a benchmark over a given engine and corpus in vectored mode. */
+static
+void benchVectored(void *context) {
+    ThreadContext *ctx = (ThreadContext *)context;
+
+    vector<VectoredInfo> v_plans = prepVectorData(ctx);
+
+    // Synchronization point
+    ctx->barrier();
+
+    startTotalTimer(ctx);
+
+    for (ResultEntry &r : ctx->results) {
+        ctx->timer.start();
+
+        for (const VectoredInfo &v_plan : v_plans) {
+            ctx->engine.scan_vectored(&v_plan.data[0], &v_plan.len[0],
+                                      v_plan.data.size(), v_plan.stream_id, r,
+                                      *ctx->enginectx);
+        }
+
+        ctx->timer.complete();
+        r.seconds = ctx->timer.seconds();
+    }
+
+    // Synchronization point
+    ctx->barrier();
+
+    // Now that all threads are finished, we can stop the clock.
+    stopTotalTimer(ctx);
+}
+
+/** Given a time and a size, compute the throughput in megabits/sec. */
+static
+long double calc_mbps(double seconds, u64a bytes) {
+    assert(seconds > 0);
+    return (long double)bytes / ((long double)seconds * 125000);
+}
+
+/** Dump per-scan throughput data to screen. */
+static
+void displayPerScanResults(const vector<unique_ptr<ThreadContext>> &threads,
+                           u64a bytesPerRun) {
+    for (const auto &t : threads) {
+        const auto &results = t->results;
+        for (size_t j = 0; j != results.size(); j++) {
+            const auto &r = results[j];
+            double mbps = calc_mbps(r.seconds, bytesPerRun);
+            printf("T %2u Scan %2zu: %'0.2f Mbit/sec\n", t->num, j, mbps);
+        }
+    }
+    printf("\n");
+}
+
+static
+u64a byte_size(const vector<DataBlock> &corpus_blocks) {
+    u64a total = 0;
+    for (const DataBlock &block : corpus_blocks) {
+        total += block.payload.size();
+    }
+
+    return total;
+}
+
+/** Dump benchmark results to screen. */
+static
+void displayResults(const vector<unique_ptr<ThreadContext>> &threads,
+                    const vector<DataBlock> &corpus_blocks) {
+    u64a bytesPerRun = byte_size(corpus_blocks);
+    u64a matchesPerRun = threads[0]->results[0].matches;
+
+    // Sanity check: all of our results should have the same match count.
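+    // Each thread scans its own copy of the same corpus with the same
+    // database, so every repeat should see an identical total; any
+    // divergence points at nondeterminism somewhere in the scan.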
+    for (const auto &t : threads) {
+        if (!all_of(begin(t->results), end(t->results),
+                    [&matchesPerRun](const ResultEntry &e) {
+                        return e.matches == matchesPerRun;
+                    })) {
+            printf("\nWARNING: PER-SCAN MATCH COUNTS ARE INCONSISTENT!\n\n");
+            break;
+        }
+    }
+
+    printf("Time spent scanning: %'0.3f seconds\n", totalSecs);
+    printf("Corpus size: %'llu bytes ", bytesPerRun);
+    switch (scan_mode) {
+    case ScanMode::STREAMING:
+        printf("(%'zu blocks in %'llu streams)\n", corpus_blocks.size(),
+               count_streams(corpus_blocks));
+        break;
+    case ScanMode::VECTORED:
+        printf("(%'zu blocks in %'llu vectors)\n", corpus_blocks.size(),
+               count_streams(corpus_blocks));
+        break;
+    case ScanMode::BLOCK:
+        printf("(%'zu blocks)\n", corpus_blocks.size());
+        break;
+    }
+
+    u64a totalBytes = bytesPerRun * repeats * threads.size();
+    u64a totalBlocks = corpus_blocks.size() * repeats * threads.size();
+
+    double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun;
+    printf("Matches per iteration: %'llu (%'0.3f matches/kilobyte)\n",
+           matchesPerRun, matchRate);
+
+    double blockRate = (double)totalBlocks / (double)totalSecs;
+    printf("Overall block rate: %'0.2f blocks/sec\n", blockRate);
+    printf("Overall throughput: %'0.2Lf Mbit/sec\n",
+           calc_mbps(totalSecs, totalBytes));
+    printf("\n");
+
+    if (display_per_scan) {
+        displayPerScanResults(threads, bytesPerRun);
+    }
+}
+
+/**
+ * Construct a thread context for this scanning mode.
+ *
+ * Note: does not take blocks by reference. This is to give every thread its
+ * own copy of the data. It would be unrealistic for every thread to be
+ * scanning the same copy of the data.
+ */
+static
+unique_ptr<ThreadContext> makeThreadContext(const EngineHyperscan &db,
+                                            const vector<DataBlock> &blocks,
+                                            unsigned id,
+                                            thread_barrier &sync_barrier) {
+    thread_func_t fn = nullptr;
+    switch (scan_mode) {
+    case ScanMode::STREAMING:
+        fn = benchStreaming;
+        break;
+    case ScanMode::VECTORED:
+        fn = benchVectored;
+        break;
+    case ScanMode::BLOCK:
+        fn = benchBlock;
+        break;
+    }
+    assert(fn);
+
+    return ue2::make_unique<ThreadContext>(id, db, sync_barrier, fn, blocks);
+}
+
+/** Run the given benchmark. */
+static
+void runBenchmark(const EngineHyperscan &db,
+                  const vector<DataBlock> &corpus_blocks) {
+    size_t numThreads;
+    bool useAffinity = false;
+
+    if (threadCores.empty()) {
+        numThreads = 1;
+    } else {
+        numThreads = threadCores.size();
+#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP
+        useAffinity = true;
+#else
+        useAffinity = false;
+#endif
+    }
+
+    // Initialise a barrier that will let us sync threads before/after scanning
+    // for timer measurements.
+    thread_barrier sync_barrier(numThreads);
+
+    vector<unique_ptr<ThreadContext>> threads;
+
+    for (unsigned i = 0; i < numThreads; i++) {
+        auto t = makeThreadContext(db, corpus_blocks, i, sync_barrier);
+        int core = useAffinity ? (int)threadCores[i] : -1;
+        if (!t->start(core)) {
+            printf("Unable to start processing thread %u\n", i);
+            exit(1);
+        }
+        threads.push_back(move(t));
+    }
+
+    // Reap threads.
+    for (auto &t : threads) {
+        t->join();
+    }
+
+    // Display global results.
+    displayResults(threads, corpus_blocks);
+}
+
+} // namespace
+
+/**
+ * Main driver.
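+ *
+ * Parses arguments, loads expressions and the corpus, then compiles and
+ * benchmarks each signature set in turn.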
+ */
+int main(int argc, char *argv[]) {
+    Grey grey;
+
+    setlocale(LC_ALL, ""); // use the user's locale
+
+#ifndef NDEBUG
+    printf("\nWARNING: DO NOT BENCHMARK A HYPERSCAN BUILD WITH ASSERTIONS\n\n");
+#endif
+
+    vector<BenchmarkSigs> sigSets;
+    processArgs(argc, argv, sigSets, grey);
+
+    // read in and process our expressions
+    ExpressionMap exprMapTemplate;
+    loadExpressions(exprPath, exprMapTemplate);
+
+    // If we have no signature sets, the user wants us to benchmark all the
+    // known expressions together.
+    if (sigSets.empty()) {
+        SignatureSet sigs;
+        for (auto i : exprMapTemplate | map_keys) {
+            sigs.push_back(i);
+        }
+        sigSets.emplace_back(exprPath, move(sigs));
+    }
+
+    // read in and process our corpus
+    vector<DataBlock> corpus_blocks;
+    try {
+        corpus_blocks = readCorpus(corpusFile);
+    } catch (const DataCorpusError &e) {
+        printf("Corpus data error: %s\n", e.msg.c_str());
+        return 1;
+    }
+
+    for (const auto &s : sigSets) {
+        ExpressionMap exprMap = exprMapTemplate; // copy
+
+        limitBySignature(exprMap, s.sigs);
+        if (exprMap.empty()) {
+            continue;
+        }
+
+        auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, grey);
+        if (!engine) {
+            printf("Error: expressions failed to compile.\n");
+            exit(1);
+        }
+
+        printf("\n");
+
+        runBenchmark(*engine, corpus_blocks);
+    }
+
+    return 0;
+}
diff --git a/tools/hsbench/scripts/CorpusBuilder.py b/tools/hsbench/scripts/CorpusBuilder.py
new file mode 100755
index 00000000..5baed2bd
--- /dev/null
+++ b/tools/hsbench/scripts/CorpusBuilder.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+'''
+A module to construct corpora databases for the Hyperscan benchmarker
+(hsbench).
+
+After construction, simply add blocks with the add_chunk() method, then call
+finish() when you're done.
+'''
+
+import os.path
+
+try:
+    from sqlite3 import dbapi2 as sqlite
+except:
+    from pysqlite2 import dbapi2 as sqlite
+
+class CorpusBuilder:
+    SCHEMA = '''
+CREATE TABLE chunk (
+    id integer primary key,
+    stream_id integer not null,
+    data blob
+);
+'''
+
+    def __init__(self, outfile):
+        if os.path.exists(outfile):
+            raise RuntimeError("Database '%s' already exists" % outfile)
+        self.outfile = outfile
+        self.db = sqlite.connect(self.outfile)
+        self.db.executescript(CorpusBuilder.SCHEMA)
+        self.current_chunk_id = 0
+
+    def add_chunk(self, stream_id, data):
+        chunk_id = self.current_chunk_id
+        c = self.db.cursor()
+        q = 'insert into chunk (id, stream_id, data) values (?, ?, ?)'
+        c.execute(q, (chunk_id, stream_id, sqlite.Binary(data)))
+        self.current_chunk_id += 1
+        return chunk_id
+
+    def finish(self):
+        self.db.commit()
+
+        c = self.db.cursor()
+        q = 'create index chunk_stream_id_idx on chunk(stream_id)'
+        c.execute(q)
+
+        c = self.db.cursor()
+        q = 'vacuum'
+        c.execute(q)
+
+        c = self.db.cursor()
+        q = 'analyze'
+        c.execute(q)
+
+        self.db.commit()
diff --git a/tools/hsbench/scripts/gutenbergCorpus.py b/tools/hsbench/scripts/gutenbergCorpus.py
new file mode 100755
index 00000000..fa1b1570
--- /dev/null
+++ b/tools/hsbench/scripts/gutenbergCorpus.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python
+
+'''
+This script creates a Hyperscan benchmarking corpus database from a supplied
+group of Project Gutenberg texts.
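+
+Texts are fetched by ID with the gutenberg package, split into blocks of the
+requested block size, and grouped into streams of roughly the requested
+stream size.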
+'''
+
+import sys, getopt, os.path
+import gutenberg.acquire, gutenberg.cleanup, gutenberg.query
+from CorpusBuilder import CorpusBuilder
+
+stream_id = 0
+stream_bytes = 0
+
+def addBlocks(builder, block_size, stream_size, text_id, text):
+    global stream_id
+    global stream_bytes
+
+    print "text", text_id, "len", len(text)
+    i = 0
+    while i < len(text):
+        chunk = text[i:min(len(text), i + block_size)]
+        builder.add_chunk(stream_id, chunk)
+        i += block_size
+        stream_bytes += len(chunk)
+        if stream_bytes >= stream_size:
+            stream_id += 1
+            stream_bytes = 0
+    print "Text", text_id, ": added", i/block_size, "blocks of", block_size, "bytes."
+
+def buildCorpus(outFN, block_size, stream_size, text_ids):
+    if len(text_ids) == 0:
+        print >>sys.stderr, "Must provide at least one input ID"
+        sys.exit(0)
+
+    builder = CorpusBuilder(outFN)
+
+    total_bytes = 0
+    stream_id = 0
+    stream_bytes = 0
+
+    for text_id in text_ids:
+        text_id = int(text_id)
+        text = gutenberg.acquire.load_etext(text_id)
+        text = gutenberg.cleanup.strip_headers(text).strip()
+        addBlocks(builder, block_size, stream_size, text_id, text)
+        total_bytes += len(text)
+
+    builder.finish()
+
+    print "Total:", total_bytes, "bytes."
+
+def usage(exeName):
+    errmsg = "Usage: %s -o <output file> -b <block size> -s <stream size> <text id>..."
+    errmsg = errmsg % exeName
+    print >> sys.stderr, errmsg
+    sys.exit(-1)
+
+if __name__ == '__main__':
+    opts, args = getopt.getopt(sys.argv[1:], 'o:b:s:')
+    opts = dict(opts)
+
+    requiredKeys = [ '-o', '-b', '-s' ]
+    for k in requiredKeys:
+        if not opts.has_key(k):
+            usage(os.path.basename(sys.argv[0]))
+
+    buildCorpus(opts['-o'], int(opts['-b']), int(opts['-s']), args)
diff --git a/tools/hsbench/scripts/linebasedCorpus.py b/tools/hsbench/scripts/linebasedCorpus.py
new file mode 100755
index 00000000..bde20e39
--- /dev/null
+++ b/tools/hsbench/scripts/linebasedCorpus.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+
+'''
+Simple script to take a file full of lines of text and push them into a
+Hyperscan benchmarking corpus database, one block per line.
+'''
+
+import sys, getopt, os.path
+from CorpusBuilder import CorpusBuilder
+
+def lineCorpus(inFN, outFN):
+    '''
+    Read lines from file name @inFN and write them as blocks to a new db with
+    name @outFN.
+    '''
+
+    if not os.path.exists(inFN):
+        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % inFN
+        sys.exit(-1)
+
+    lines = open(inFN).readlines()
+
+    if len(lines) == 0:
+        print >> sys.stderr, "Input file contained no lines. Exiting."
+        sys.exit(0)
+
+    builder = CorpusBuilder(outFN)
+
+    # write a single stream to contain everything
+    streamId = 0
+
+    for l in lines:
+        builder.add_chunk(streamId, l.rstrip())
+
+    builder.finish()
+
+def usage(exeName):
+    errmsg = "Usage: %s -i <input file> -o <output file>"
+    errmsg = errmsg % exeName
+    print >> sys.stderr, errmsg
+    sys.exit(-1)
+
+if __name__ == '__main__':
+    args = getopt.getopt(sys.argv[1:], 'i:o:c:')
+    args = dict(args[0])
+
+    requiredKeys = [ '-i', '-o' ]
+    for k in requiredKeys:
+        if not args.has_key(k):
+            usage(os.path.basename(sys.argv[0]))
+
+    fnArgs = tuple([args[k] for k in requiredKeys])
+    lineCorpus(*fnArgs)
diff --git a/tools/hsbench/scripts/pcapCorpus.py b/tools/hsbench/scripts/pcapCorpus.py
new file mode 100755
index 00000000..c10bfef3
--- /dev/null
+++ b/tools/hsbench/scripts/pcapCorpus.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python
+
+'''
+Script to convert a pcap file containing UDP and TCP packets to a corpus file.
+'''
+
+import sys, getopt, pprint, os
+from sqlite3 import dbapi2 as sqlite
+import pcap
+from optparse import OptionParser
+from socket import AF_INET, IPPROTO_UDP, IPPROTO_TCP, inet_ntop, ntohs, ntohl, inet_ntoa
+import struct
+from CorpusBuilder import CorpusBuilder
+
+ETHERTYPE_IP = 0x0800      # IP protocol
+ETHERTYPE_ARP = 0x0806     # Addr. resolution protocol
+ETHERTYPE_REVARP = 0x8035  # reverse Addr. resolution protocol
+ETHERTYPE_VLAN = 0x8100    # IEEE 802.1Q VLAN tagging
+ETHERTYPE_IPV6 = 0x86dd    # IPv6
+
+#
+# A dictionary of active TCP streams
+#
+tcp_streams = {}
+
+#
+# A dictionary of UDP streams
+#
+udp_streams = {}
+
+#
+# Current stream id
+cur_stream_id = 0
+
+def usage(exeName) :
+    errmsg = "Usage: %s -i <pcap file> -o <sqlite file>"
+    errmsg = errmsg % exeName
+    print >> sys.stderr, errmsg
+    sys.exit(-1)
+
+class FiveTuple(object):
+    def __init__(self, protocol, src_addr, src_port, dst_addr, dst_port):
+        self.protocol = protocol
+        self.src_addr = src_addr
+        self.src_port = src_port
+        self.dst_addr = dst_addr
+        self.dst_port = dst_port
+
+    def __str__(self):
+        return "%d,%s,%d,%s,%d" % (self.protocol, self.src_addr, self.src_port, self.dst_addr, self.dst_port)
+
+class UdpSegment:
+    """Definition of a UDP segment
+    """
+    def __init__(self, five_tuple, header, payload):
+        self.five_tuple = five_tuple
+        self.udp_header = header
+        self.udp_payload = payload
+
+class TcpSegment:
+    """Definition of a TCP segment
+    """
+    def __init__(self, five_tuple, header, payload):
+        self.five_tuple = five_tuple
+        self.tcp_header = header
+        self.tcp_payload = payload
+        self.tcp_sequence_number, self.tcp_acknowledgement_number = struct.unpack('!LL', header[4:12])
+
+    def opt_isset_FIN(self):
+        opts = ord(self.tcp_header[13]) & 0x3F
+        return (opts & 0x01)
+
+    def opt_isset_SYN(self):
+        opts = ord(self.tcp_header[13]) & 0x3F
+        return (opts & 0x02)
+
+    def get_sequence_number(self):
+        return self.tcp_sequence_number
+
+    def __cmp__(self, other):
+        return cmp(self.tcp_sequence_number, other.tcp_sequence_number)
+
+class TcpStream:
+    """Definition of a TCP stream.
+    """
+    TCP_STREAM_ACTIVE = 0x1
+    TCP_STREAM_CLOSED = 0x02
+
+    def __init__(self, five_tuple):
+        self.five_tuple = five_tuple
+        self.initial_sequence_number = 0
+        self.segments = []
+
+    def reset_stream(self):
+        self.segments = []
+        self.initial_sequence_number = 0
+
+    def set_initial_sequence_number(self, sequence_number):
+        self.initial_sequence_number = sequence_number
+
+    def append_segment(self, tcp_segment):
+        if len(self.segments) == 0:
+            self.set_initial_sequence_number(tcp_segment.get_sequence_number())
+        self.segments.append(tcp_segment)
+
+    def get_segments_sorted(self):
+        return sorted(self.segments)
+
+class UdpStream:
+    """A container for UDP packets that share the same 5-tuple
+    """
+    def __init__(self, five_tuple):
+        self.five_tuple = five_tuple
+        self.segments = []
+
+    def append_segment(self, udp_segment):
+        self.segments.append(udp_segment)
+
+
+def newStream(five_tuple):
+    '''
+    Create a new stream using the arguments passed-in and return its ID.
+    '''
+    global cur_stream_id
+    stream_id = cur_stream_id
+    cur_stream_id += 1
+    return stream_id
+
+def process_tcp_segment(builder, segment):
+    """Process a tcp segment. It checks for SYN and FIN segments and,
+    if set, modifies the associated stream.
+ """ + segment_id = str(segment.five_tuple) + if segment_id in tcp_streams: + m_tcp_stream = tcp_streams[segment_id] + m_tcp_stream.append_segment(segment) + else: + m_tcp_stream = TcpStream(segment.five_tuple) + m_tcp_stream.append_segment(segment) + tcp_streams[segment_id] = m_tcp_stream + + + if segment.opt_isset_SYN(): + m_tcp_stream.segments = [] + + if segment.opt_isset_FIN(): + # + # Finished with the stream - add the segments in the + # stream to db allowing the stream to be reused. + # + db_add_tcp_stream_segments(builder, m_tcp_stream) + del tcp_streams[segment_id] + +def process_udp_segment(builder, segment): + """ Process a UDP segment. Given the connectionless nature of the UDP + protocol we simple accumulate the segment for later processing + when all the packets have been read + """ + segment_id = str(segment.five_tuple) + if segment_id in udp_streams: + m_udp_stream = udp_streams[segment_id] + m_udp_stream.append_segment(segment) + else: + m_udp_stream = UdpStream(segment.five_tuple) + m_udp_stream.append_segment(segment) + udp_streams[segment_id] = m_udp_stream + + +def db_add_tcp_stream_segments(builder, tcp_stream): + """Add the contents of a tcp stream to the database + """ + tcp_segments = tcp_stream.get_segments_sorted() + last_sequence_num = 0 + streamID = None + + for tcp_segment in tcp_segments: + if (len(tcp_segment.tcp_payload) > 0) and (tcp_segment.tcp_sequence_number > last_sequence_num): + # + # Segment with an actual payload - add it to the stream's + # list of chunks. + # + # Note: delay creating the stream until we have a via chunk to + # commit to it + # + if streamID == None: + streamID = newStream(tcp_stream.five_tuple) + builder.add_chunk(streamID, tcp_segment.tcp_payload) + last_sequence_num = tcp_segment.tcp_sequence_number + + +def db_add_udp_stream_segments(builder, udp_stream): + """Add the contents of a UDP stream to the database. Since UDP is + connection-less, a UDP stream object is really just an accumulation + of all the packets associated with a given 5-tuple. + """ + udp_segments = udp_stream.segments + streamID = None + for udp_segment in udp_segments: + if len(udp_segment.udp_payload) > 0: + if streamID == None: + streamID = newStream(udp_stream.five_tuple) + builder.add_chunk(streamID, udp_segment.udp_payload) + +def enchunk_pcap(pcapFN, sqliteFN): + """Read the contents of a pcap file with name @pcapFN and produce + a sqlite db with name @sqliteFN. It will contain chunks of data + from TCP and UDP streams, + """ + + if not os.path.exists(pcapFN): + print >> sys.stderr, "Input file '%s' does not exist. Exiting." 
% pcapFN + sys.exit(-1) + + builder = CorpusBuilder(sqliteFN) + + # + # Read in the contents of the pcap file, adding stream segments as found + # + pkt_cnt = 0; + ip_pkt_cnt = 0; + unsupported_ip_protocol_cnt = 0 + pcap_ref = pcap.pcap(pcapFN) + done = False + + while not done: + try: + ts, packet = pcap_ref.next() + except: + break + + pkt_cnt += 1 + + linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0] + if linkLayerType != ETHERTYPE_IP: + # + # We're only interested in IP packets + # + continue + + ip_pkt_cnt += 1 + + ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2: pcap_ref.dloff + 4])[0] + ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len] + pkt_protocol = struct.unpack('B', ip_pkt[9])[0] + + if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP): + # + # we're only interested in UDP and TCP packets at the moment + # + continue + + pkt_src_addr = inet_ntoa(ip_pkt[12:16]) + pkt_dst_addr = inet_ntoa(ip_pkt[16:20]) + + ip_hdr_len_offset = (ord(ip_pkt[0]) & 0x0f) * 4 + ip_payload = ip_pkt[ip_hdr_len_offset:len(ip_pkt)] + + pkt_src_port, pkt_dst_port = struct.unpack('!HH', ip_payload[0:4]) + five_tuple = FiveTuple(pkt_protocol, pkt_src_addr, pkt_src_port, pkt_dst_addr, pkt_dst_port) + five_tuple_id = str(five_tuple) + + if pkt_protocol == IPPROTO_UDP: + udp_payload_len = struct.unpack('!H', ip_payload[4:6])[0] - 8 + udp_header = ip_payload[0:8] + udp_payload = ip_payload[8:len(ip_payload)] + udp_segment = UdpSegment(five_tuple, udp_header, udp_payload) + process_udp_segment(builder, udp_segment) + elif pkt_protocol == IPPROTO_TCP: + tcp_hdr_len = (ord(ip_payload[12]) >> 4) * 4 + tcp_header = ip_payload[0:tcp_hdr_len] + tcp_payload = ip_payload[tcp_hdr_len:len(ip_payload)] + segment = TcpSegment(five_tuple, tcp_header, tcp_payload) + process_tcp_segment(builder, segment) + + # + # Having read the contents of the pcap, we fill the database with any + # remaining TCP and UDP segments + # + for tcp_stream in tcp_streams.itervalues(): + db_add_tcp_stream_segments(builder, tcp_stream) + + for udp_stream in udp_streams.itervalues(): + db_add_udp_stream_segments(builder, udp_stream) + + # + # We've finished with the database + # + builder.finish() + +if __name__ == '__main__' : + + args = getopt.getopt(sys.argv[1:], 'i:o:') + args = dict(args[0]) + + requiredKeys = [ '-i', '-o'] + for k in requiredKeys : + if not args.has_key(k) : + usage(os.path.basename(sys.argv[0])) + + fnArgs = tuple([ args[k] for k in requiredKeys ]) + enchunk_pcap(*fnArgs) diff --git a/tools/hsbench/thread_barrier.h b/tools/hsbench/thread_barrier.h new file mode 100644 index 00000000..1c3a53e7 --- /dev/null +++ b/tools/hsbench/thread_barrier.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * \brief Simple thread barrier.
+ */
+
+#ifndef TOOLS_THREAD_BARRIER_H
+#define TOOLS_THREAD_BARRIER_H
+
+#include <condition_variable>
+#include <mutex>
+#include <stdexcept>
+
+/**
+ * \brief Simple thread barrier class.
+ *
+ * Blocks until wait() has been called N times.
+ */
+class thread_barrier {
+public:
+    explicit thread_barrier(unsigned int n) : max(n) {
+        if (max == 0) {
+            throw std::runtime_error("invalid barrier");
+        }
+    }
+
+    void wait() {
+        std::unique_lock<std::mutex> lock(mtx);
+        count++;
+        if (count >= max) {
+            count = 0;
+            condvar.notify_all();
+        } else {
+            condvar.wait(lock);
+        }
+    }
+
+private:
+    std::mutex mtx;
+    std::condition_variable condvar;
+    unsigned int count = 0;
+    unsigned int max;
+};
+
+#endif // TOOLS_THREAD_BARRIER_H
diff --git a/tools/hsbench/timer.h b/tools/hsbench/timer.h
new file mode 100644
index 00000000..85bd294c
--- /dev/null
+++ b/tools/hsbench/timer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TIMER_H
+#define TIMER_H
+
+#include "ue2common.h"
+
+#include <chrono>
+
+class Timer {
+public:
+    Timer() = default;
+
+    void start() {
+        clock_start = Clock::now();
+    }
+
+    void complete() {
+        clock_end = Clock::now();
+    }
+
+    double seconds() const {
+        std::chrono::duration<double> secs = clock_end - clock_start;
+        return secs.count();
+    }
+
+protected:
+    using Clock = std::chrono::steady_clock;
+    std::chrono::time_point<Clock> clock_start;
+    std::chrono::time_point<Clock> clock_end;
+};
+
+#endif // TIMER_H
diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt
index 63f3a9ac..8b494444 100644
--- a/unit/CMakeLists.txt
+++ b/unit/CMakeLists.txt
@@ -34,7 +34,7 @@ add_library(gtest STATIC ${gtest_SOURCES})
 add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR})
 
-if (NOT RELEASE_BUILD)
+if (NOT (RELEASE_BUILD OR FAT_RUNTIME))
 set(unit_internal_SOURCES
     internal/bitfield.cpp
     internal/bitutils.cpp
@@ -71,6 +71,7 @@ set(unit_internal_SOURCES
     internal/repeat.cpp
     internal/rose_build_merge.cpp
     internal/rose_mask.cpp
+    internal/rose_mask_32.cpp
     internal/rvermicelli.cpp
     internal/simd_utils.cpp
     internal/shuffle.cpp
@@ -88,7 +89,7 @@ set(unit_internal_SOURCES
 add_executable(unit-internal ${unit_internal_SOURCES})
 target_link_libraries(unit-internal hs gtest corpusomatic)
-endif(NOT RELEASE_BUILD)
+endif(NOT (RELEASE_BUILD OR FAT_RUNTIME))
 
 set(unit_hyperscan_SOURCES
     hyperscan/allocators.cpp
diff --git a/unit/hyperscan/arg_checks.cpp b/unit/hyperscan/arg_checks.cpp
index d277a26b..8e86cc64 100644
--- a/unit/hyperscan/arg_checks.cpp
+++ b/unit/hyperscan/arg_checks.cpp
@@ -84,6 +84,12 @@ void breakDatabaseBytecode(hs_database *db) {
     *bytecode += 3;
 }
 
+// Check that hs_valid_platform says we can run here
+TEST(HyperscanArgChecks, ValidPlatform) {
+    hs_error_t error = hs_valid_platform();
+    ASSERT_EQ(HS_SUCCESS, error) << "hs_valid_platform should return zero";
+}
+
 // Check that hs_version gives us a reasonable string back
 TEST(HyperscanArgChecks, Version) {
     const char *version = hs_version();
diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp
index 4d476932..31aaf17f 100644
--- a/unit/internal/bitutils.cpp
+++ b/unit/internal/bitutils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -436,3 +436,16 @@ TEST(BitUtils, rank_in_mask64) {
     ASSERT_EQ(15, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 31));
     ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63));
 }
+
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+TEST(BitUtils, pdep64) {
+    u64a data = 0xF123456789ABCDEF;
+    ASSERT_EQ(0xfULL, pdep64(data, 0xf));
+    ASSERT_EQ(0xefULL, pdep64(data, 0xff));
+    ASSERT_EQ(0xf0ULL, pdep64(data, 0xf0));
+    ASSERT_EQ(0xfULL, pdep64(data, 0xf));
+    ASSERT_EQ(0xef0ULL, pdep64(data, 0xff0));
+    ASSERT_EQ(0xef00ULL, pdep64(data, 0xff00));
+    ASSERT_EQ(0xd0e0f00ULL, pdep64(data, 0xf0f0f00));
+}
+#endif
diff --git a/unit/internal/fdr.cpp b/unit/internal/fdr.cpp
index c66ab4c5..6116bfdb 100644
--- a/unit/internal/fdr.cpp
+++ b/unit/internal/fdr.cpp
@@ -337,8 +337,8 @@ TEST_P(FDRp, NoRepeat3) {
 static
 hwlm_error_t safeExecStreaming(const FDR *fdr, const u8 *hbuf, size_t hlen,
                                const u8 *buf, size_t len, size_t start,
-                               HWLMCallback cb, void *ctxt, hwlm_group_t groups,
-                               u8 *stream_state) {
+                               HWLMCallback cb, void *ctxt,
+                               hwlm_group_t groups) {
     array<u8, 16> wrapped_history = {{'0', '1', '2', '3', '4', '5', '6', '7', '8',
                                      '9', 'a', 'b', 'c', 'd', 'e', 'f'}};
     if (hlen < 16) {
@@ -346,8 +346,7 @@ hwlm_error_t safeExecStreaming(const FDR *fdr, const u8 *hbuf, size_t hlen,
         memcpy(new_hbuf, hbuf, hlen);
         hbuf = new_hbuf;
     }
-    return fdrExecStreaming(fdr, hbuf, hlen, buf, len, start, cb, ctxt, groups,
-                            stream_state);
+    return fdrExecStreaming(fdr, hbuf, hlen, buf, len, start, cb, ctxt, groups);
 }
 
 TEST_P(FDRp, SmallStreaming) {
@@ -366,7 +365,7 @@ TEST_P(FDRp, SmallStreaming) {
     expected.push_back(match(2, 2, 1));
 
     safeExecStreaming(fdr.get(), (const u8 *)"", 0, (const u8 *)"aaar", 4, 0,
-                      decentCallback, &matches, HWLM_ALL_GROUPS, nullptr);
+                      decentCallback, &matches, HWLM_ALL_GROUPS);
     for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) {
         EXPECT_EQ(expected[i], matches[i]);
     }
@@ -378,7 +377,7 @@ TEST_P(FDRp, SmallStreaming) {
     expected.push_back(match(1, 8, 10));
 
     safeExecStreaming(fdr.get(), (const u8 *)"aaar", 4, (const u8 *)"dvark", 5,
-                      0, decentCallback, &matches, HWLM_ALL_GROUPS, nullptr);
+                      0, decentCallback, &matches, HWLM_ALL_GROUPS);
 
     for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) {
         EXPECT_EQ(expected[i], matches[i] + 4);
@@ -407,7 +406,7 @@ TEST_P(FDRp, SmallStreaming2) {
 
     safeExecStreaming(fdr.get(), (const u8 *)"foobar", 6,
                       (const u8 *)"aardvarkkk", 10, 0, decentCallback, &matches,
-                      HWLM_ALL_GROUPS, nullptr);
+                      HWLM_ALL_GROUPS);
 
     for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) {
         EXPECT_EQ(expected[i], matches[i] + 6);
@@ -445,44 +444,6 @@ TEST_P(FDRp, LongLiteral) {
     EXPECT_EQ(0U, count);
 }
 
-TEST_P(FDRp, VeryLongLiteral) {
-    const u32 hint = GetParam();
-    SCOPED_TRACE(hint);
-    vector<hwlmLiteral> lits;
-
-    string s1000;
-    for(int i = 0; i < 1000; i++) {
-        s1000 += char('A' + i % 10);
-    }
-
-    string s66k;
-    for(int i = 0; i < 66; i++) {
-        s66k += s1000;
-    }
-
-    string corpus = s66k + s66k;
-    lits.push_back(hwlmLiteral(s66k.c_str(), 0, 10));
-
-    auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey());
-    CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint);
-
-    vector<match> matches;
-    u32 rv = fdrExec(fdr.get(), (const u8 *)s66k.c_str(), s66k.size(), 0,
-                     decentCallback, &matches, HWLM_ALL_GROUPS);
-    EXPECT_EQ(0U, rv);
-    ASSERT_EQ(1U, matches.size());
-    ASSERT_EQ(match(0, 65999, 10), matches[0]);
-
-    matches.clear();
-    rv = fdrExec(fdr.get(), (const u8 *)corpus.c_str(), corpus.size(), 0,
-                 decentCallback, &matches, HWLM_ALL_GROUPS);
-    EXPECT_EQ(0U, rv);
-    for (u32 i = 0; i < matches.size(); i++) {
-        ASSERT_EQ(match(10 * i, 65999 + 10 * i, 10), matches[i]);
-    }
-    EXPECT_EQ(6601U, matches.size());
-}
-
 TEST_P(FDRp, moveByteStream) {
     const u32 hint = GetParam();
     SCOPED_TRACE(hint);
@@ -538,9 +499,9 @@ TEST_P(FDRp, Stream1) {
 
     // check matches
     vector<match> matches;
-    fdrStatus = safeExecStreaming(
-        fdr.get(), (const u8 *)data1, data_len1, (const u8 *)data2, data_len2,
-        0, decentCallback, &matches, HWLM_ALL_GROUPS, nullptr);
+    fdrStatus = safeExecStreaming(fdr.get(), (const u8 *)data1, data_len1,
+                                  (const u8 *)data2, data_len2, 0,
+                                  decentCallback, &matches, HWLM_ALL_GROUPS);
     ASSERT_EQ(0, fdrStatus);
 
     ASSERT_EQ(4U, matches.size());
@@ -783,9 +744,9 @@ TEST(FDR, FDRTermS) {
 
     // check matches
     vector<match> matches;
-    fdrStatus = safeExecStreaming(
-        fdr.get(), (const u8 *)data1, data_len1, (const u8 *)data2, data_len2,
-        0, decentCallbackT, &matches, HWLM_ALL_GROUPS, nullptr);
+    fdrStatus = safeExecStreaming(fdr.get(), (const u8 *)data1, data_len1,
+                                  (const u8 *)data2, data_len2, 0,
+                                  decentCallbackT, &matches, HWLM_ALL_GROUPS);
     ASSERT_EQ(HWLM_TERMINATED, fdrStatus);
 
     ASSERT_EQ(1U, matches.size());
@@ -812,30 +773,3 @@ TEST(FDR, FDRTermB) {
 
     ASSERT_EQ(1U, matches.size());
 }
-
-TEST(FDR, ManyLengths) {
-    // UE-2400: we had a crash due to div by zero in the compiler when given a
-    // set of literals with precisely 512 different lengths.
-    const u32 num = 512;
-    vector<hwlmLiteral> lits;
-    char c = 0;
-    string s;
-    for (u32 i = 0; i < num; i++) {
-        s.push_back(c++);
-        lits.push_back(hwlmLiteral(s, false, i + 1));
-    }
-
-    auto fdr = fdrBuildTable(lits, false, get_current_target(), Grey());
-    ASSERT_TRUE(fdr != nullptr);
-
-    // Confirm that we can scan against this FDR table as well.
-
-    vector<match> matches;
-
-    hwlm_error_t fdrStatus =
-        fdrExec(fdr.get(), (const u8 *)s.c_str(), s.size(), 0, decentCallback,
-                &matches, HWLM_ALL_GROUPS);
-    ASSERT_EQ(HWLM_SUCCESS, fdrStatus);
-
-    ASSERT_EQ(768U, matches.size());
-}
diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp
index 68d8f632..7b00ac4c 100644
--- a/unit/internal/fdr_flood.cpp
+++ b/unit/internal/fdr_flood.cpp
@@ -495,7 +495,7 @@ TEST_P(FDRFloodp, StreamingMask) {
         const u8 *fhist = fake_history.data() + fake_history_size;
         fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0,
                                      countCallback, &matchesCounts,
-                                     HWLM_ALL_GROUPS, nullptr);
+                                     HWLM_ALL_GROUPS);
         ASSERT_EQ(0, fdrStatus);
         for (u32 j = streamChunk; j < dataSize; j += streamChunk) {
             if (j < 16) {
@@ -506,12 +506,12 @@ TEST_P(FDRFloodp, StreamingMask) {
                 fdrStatus = fdrExecStreaming(fdr.get(), tmp_d, j, tmp_d + j,
                                              streamChunk, 0, countCallback,
                                              &matchesCounts,
-                                             HWLM_ALL_GROUPS, nullptr);
+                                             HWLM_ALL_GROUPS);
             } else {
                 fdrStatus = fdrExecStreaming(fdr.get(), d + j - 8, 8, d + j,
                                              streamChunk, 0, countCallback,
                                              &matchesCounts,
-                                             HWLM_ALL_GROUPS, nullptr);
+                                             HWLM_ALL_GROUPS);
             }
             ASSERT_EQ(0, fdrStatus);
         }
diff --git a/unit/internal/graph.cpp b/unit/internal/graph.cpp
index 3ab3326d..b7ec7b03 100644
--- a/unit/internal/graph.cpp
+++ b/unit/internal/graph.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -29,10 +29,14 @@
 #include "config.h"
 #include "gtest/gtest.h"
 #include "util/graph.h"
+#include "util/ue2_graph.h"
 
 #include 
 #include 
 #include 
+#include 
+
+#include 
 
 using namespace boost;
 using namespace std;
@@ -167,107 +171,1617 @@ TEST(graph_util, degrees) {
     ASSERT_TRUE( has_proper_successor(d, g));
     ASSERT_FALSE(has_proper_successor(e, g));
     ASSERT_TRUE( has_proper_successor(f, g));
-
-    ASSERT_TRUE( hasGreaterInDegree(0, a, g));
-    ASSERT_FALSE(hasGreaterInDegree(1, a, g));
-    ASSERT_TRUE( hasGreaterInDegree(2, b, g));
-    ASSERT_FALSE(hasGreaterInDegree(3, b, g));
-    ASSERT_TRUE( hasGreaterInDegree(1, c, g));
-    ASSERT_FALSE(hasGreaterInDegree(2, c, g));
-    ASSERT_FALSE(hasGreaterInDegree(0, d, g));
-    ASSERT_TRUE( hasGreaterInDegree(1, e, g));
-    ASSERT_FALSE(hasGreaterInDegree(2, e, g));
-    ASSERT_FALSE(hasGreaterInDegree(0, f, g));
-
-    ASSERT_TRUE( hasGreaterOutDegree(0, a, g));
-    ASSERT_FALSE(hasGreaterOutDegree(1, a, g));
-    ASSERT_TRUE( hasGreaterOutDegree(1, b, g));
-    ASSERT_FALSE(hasGreaterOutDegree(2, b, g));
-    ASSERT_FALSE(hasGreaterOutDegree(0, c, g));
-    ASSERT_TRUE( hasGreaterOutDegree(0, d, g));
-    ASSERT_FALSE(hasGreaterOutDegree(1, d, g));
-    ASSERT_TRUE( hasGreaterOutDegree(0, e, g));
-    ASSERT_FALSE(hasGreaterOutDegree(1, e, g));
-    ASSERT_TRUE( hasGreaterOutDegree(2, f, g));
-    ASSERT_FALSE(hasGreaterOutDegree(3, f, g));
 }
 
-TEST(graph_util, in_degree_equal_to_1) {
-    unit_graph g;
+struct SimpleV {
+    size_t index;
+    string test_v = "SimpleV";
+};
 
-    unit_vertex a = add_vertex(g);
-    unit_vertex b = add_vertex(g);
-    unit_vertex c = add_vertex(g);
-    unit_vertex d = add_vertex(g);
+struct SimpleE {
+    size_t index;
+    string test_e = "SimpleE";
+};
 
-    ASSERT_TRUE(in_degree_equal_to(a, g, 0));
-    ASSERT_FALSE(in_degree_equal_to(a, g, 1));
-    ASSERT_FALSE(in_degree_equal_to(a, g, 2));
+struct SimpleG : public ue2_graph<SimpleG, SimpleV, SimpleE> {
+};
+
+TEST(ue2_graph, graph_concept) {
+    static_assert(std::is_same<SimpleG::vertex_descriptor,
+                      graph_traits<SimpleG>::vertex_descriptor>::value,
+                  "vertex_descriptor");
+    static_assert(std::is_same<SimpleG::edge_descriptor,
+                      graph_traits<SimpleG>::edge_descriptor>::value,
+                  "edge_descriptor");
+    static_assert(std::is_same<SimpleG::directed_category,
+                      graph_traits<SimpleG>::directed_category>::value,
+                  "directed_category");
+    static_assert(std::is_same<SimpleG::edge_parallel_category,
+                      graph_traits<SimpleG>::edge_parallel_category>::value,
+                  "edge_parallel_category");
+    static_assert(std::is_same<SimpleG::traversal_category,
+                      graph_traits<SimpleG>::traversal_category>::value,
+                  "traversal_category");
+
+    UNUSED SimpleG::vertex_descriptor n = SimpleG::null_vertex();
+
+    BOOST_CONCEPT_ASSERT((GraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, vertex_list_concept) {
+    BOOST_CONCEPT_ASSERT((VertexListGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, edge_list_concept) {
+    BOOST_CONCEPT_ASSERT((EdgeListGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, incidence_concept) {
+    BOOST_CONCEPT_ASSERT((IncidenceGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, bidi_concept) {
+    BOOST_CONCEPT_ASSERT((BidirectionalGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, mutable_concept) {
+    BOOST_CONCEPT_ASSERT((MutableGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, property_concept) {
+    static_assert(std::is_same<SimpleG::vertex_property_type, SimpleV>::value,
+                  "vertex_property_type");
+    static_assert(std::is_same<SimpleG::edge_property_type, SimpleE>::value,
+                  "edge_property_type");
+
+    /* Although documented as part of the MutablePropertyGraph concept,
+     * (vertex|edge)_property_type don't appear to exist in the traits for any
+     * existing graph types and the typedefs are not installed by default */
+
+    // static_assert(std::is_same<
+    //                   typename graph_traits<SimpleG>::vertex_property_type,
+    //                   SimpleV>::value,
+    //               "vertex_property_type");
+    // static_assert(std::is_same<
+    //                   typename graph_traits<SimpleG>::edge_property_type,
+    //                   SimpleE>::value,
+    //               "edge_property_type");
+
+    /* However, there does seem to be an undocumented templated structure
+     * paralleling the main graph_traits */
+    static_assert(std::is_same<
+                      typename vertex_property_type<SimpleG>::type,
+                      SimpleV>::value,
+                  "vertex_property_type");
+    static_assert(std::is_same<
+                      typename edge_property_type<SimpleG>::type,
+                      SimpleE>::value,
+                  "edge_property_type");
+
+    BOOST_CONCEPT_ASSERT((VertexMutablePropertyGraphConcept<SimpleG>));
+    BOOST_CONCEPT_ASSERT((EdgeMutablePropertyGraphConcept<SimpleG>));
+}
+
+TEST(ue2_graph, add_vertex) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+}
+
+TEST(ue2_graph, add_and_remove_vertex) {
+    SimpleG g;
+    ASSERT_EQ(0U, num_vertices(g));
+
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_EQ(1U, num_vertices(g));
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    auto p = vertices(g);
+    ASSERT_NE(p.first, p.second);
+    ASSERT_EQ(a, *p.first);
+    ++p.first;
+    ASSERT_EQ(p.first, p.second);
+
+    remove_vertex(a, g);
+    ASSERT_EQ(0U, num_vertices(g));
+    auto q = vertices(g);
+    ASSERT_EQ(q.first, q.second);
+}
+
+TEST(ue2_graph, add_edge) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    auto q = edge(a, b, g);
+    ASSERT_TRUE(q.second);
+    ASSERT_EQ(p.first, q.first);
+}
+
+TEST(ue2_graph, add_remove_edge1) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    remove_edge(p.first, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_remove_edge2) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    remove_edge(a, b, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_edge_clear1) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    clear_vertex(a, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_edge_clear2) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    clear_vertex(b, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_edge_clear_out) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    clear_out_edges(a, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_edge_clear_in) {
+    SimpleG g;
+    SimpleG::vertex_descriptor a = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), a);
+    SimpleG::vertex_descriptor b = add_vertex(g);
+    ASSERT_NE(SimpleG::null_vertex(), b);
+    ASSERT_NE(a, b);
+    auto p = add_edge(a, b, g);
+    ASSERT_TRUE(p.second);
+    ASSERT_EQ(1U, num_edges(g));
+
+    ASSERT_EQ(a, source(p.first, g));
+    ASSERT_EQ(b, target(p.first, g));
+
+    clear_in_edges(b, g);
+    auto q = edge(a, b, g);
+    ASSERT_FALSE(q.second);
+    ASSERT_EQ(q.first, SimpleG::null_edge());
+    ASSERT_EQ(0U, num_edges(g));
+}
+
+TEST(ue2_graph, add_remove_edge_iter) {
+
SimpleG g; + SimpleG::vertex_descriptor a = add_vertex(g); + ASSERT_NE(SimpleG::null_vertex(), a); + SimpleG::vertex_descriptor b = add_vertex(g); + ASSERT_NE(SimpleG::null_vertex(), b); + ASSERT_NE(a, b); + auto p = add_edge(a, b, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + + ASSERT_EQ(a, source(p.first, g)); + ASSERT_EQ(b, target(p.first, g)); + + remove_edge(edges(g).first, g); + auto q = edge(a, b, g); + ASSERT_FALSE(q.second); + ASSERT_EQ(q.first, SimpleG::null_edge()); + ASSERT_EQ(0U, num_edges(g)); +} + +TEST(ue2_graph, vertices_0) { + SimpleG g; + auto p = vertices(g); + ASSERT_EQ(p.first, p.second); +} + +TEST(ue2_graph, vertices_1) { + SimpleG g; + SimpleG::vertex_iterator vi; + SimpleG::vertex_iterator ve; + auto a = add_vertex(g); + + ASSERT_EQ(1U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(a, *vi++); + ASSERT_EQ(vi, ve); + + auto b = add_vertex(g); + auto c = add_vertex(g); + auto d = add_vertex(g); + + ASSERT_EQ(4U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(a, *vi++); + ASSERT_EQ(b, *vi++); + ASSERT_EQ(c, *vi++); + ASSERT_EQ(d, *vi++); + ASSERT_EQ(vi, ve); + + remove_vertex(c, g); + + ASSERT_EQ(3U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(a, *vi++); + ASSERT_EQ(b, *vi++); + ASSERT_EQ(d, *vi++); + ASSERT_EQ(vi, ve); + + remove_vertex(a, g); + + ASSERT_EQ(2U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(b, *vi++); + ASSERT_EQ(d, *vi++); + ASSERT_EQ(vi, ve); + + auto e = add_vertex(g); + + ASSERT_EQ(3U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(b, *vi++); + ASSERT_EQ(d, *vi++); + ASSERT_EQ(e, *vi++); + ASSERT_EQ(vi, ve); + + remove_vertex(e, g); + + ASSERT_EQ(2U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(b, *vi++); + ASSERT_EQ(d, *vi++); + ASSERT_EQ(vi, ve); + + remove_vertex(b, g); + remove_vertex(d, g); + + ASSERT_EQ(0U, num_vertices(g)); + tie(vi, ve) = vertices(g); + ASSERT_EQ(vi, ve); +} + +TEST(ue2_graph, out_edges_1) { + SimpleG g; + auto a = add_vertex(g); + + ASSERT_EQ(1U, num_vertices(g)); + ASSERT_EQ(0U, out_degree(a, g)); + + SimpleG::out_edge_iterator ei; + SimpleG::out_edge_iterator ee; + + tie(ei, ee) = out_edges(a, g); + ASSERT_TRUE(ei == ee); + + auto p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + SimpleG::edge_descriptor e1 = p.first; + + ASSERT_EQ(1U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e2 = p.first; + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); +} + +TEST(ue2_graph, out_edges_2) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + auto c = add_vertex(g); + + ASSERT_EQ(3U, num_vertices(g)); + ASSERT_EQ(0U, out_degree(a, g)); + + SimpleG::out_edge_iterator ei; + SimpleG::out_edge_iterator ee; + + tie(ei, ee) = out_edges(a, g); + ASSERT_TRUE(ei == ee); + + auto p = add_edge(a, b, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + SimpleG::edge_descriptor e1 = p.first; + + ASSERT_EQ(1U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, c, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e2 = p.first; + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + 
ASSERT_EQ(ei, ee); + + p = add_edge(c, b, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(3U, num_edges(g)); + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(b, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(4U, num_edges(g)); + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + remove_edge(a, c, g); + ASSERT_EQ(3U, num_edges(g)); + + ASSERT_EQ(1U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_EQ(4U, num_edges(g)); + ASSERT_TRUE(p.second); + SimpleG::edge_descriptor e3 = p.first; + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e3, *ei++); + ASSERT_EQ(ei, ee); + + clear_out_edges(a, g); + ASSERT_EQ(2U, num_edges(g)); + + ASSERT_EQ(0U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(ei, ee); +} + +TEST(ue2_graph, in_edges_1) { + SimpleG g; + auto a = add_vertex(g); + + ASSERT_EQ(1U, num_vertices(g)); + ASSERT_EQ(0U, in_degree(a, g)); + + SimpleG::in_edge_iterator ei; + SimpleG::in_edge_iterator ee; + + tie(ei, ee) = in_edges(a, g); + ASSERT_TRUE(ei == ee); + + auto p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + SimpleG::edge_descriptor e1 = p.first; + + ASSERT_EQ(1U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e2 = p.first; + + ASSERT_EQ(2U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); +} + +TEST(ue2_graph, in_edges_2) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + auto c = add_vertex(g); + + ASSERT_EQ(3U, num_vertices(g)); + ASSERT_EQ(0U, in_degree(a, g)); + + SimpleG::in_edge_iterator ei; + SimpleG::in_edge_iterator ee; + + tie(ei, ee) = in_edges(a, g); + ASSERT_TRUE(ei == ee); + + auto p = add_edge(b, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + SimpleG::edge_descriptor e1 = p.first; + + ASSERT_EQ(1U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(c, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e2 = p.first; + + ASSERT_EQ(2U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(c, b, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(3U, num_edges(g)); + + ASSERT_EQ(2U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, b, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(4U, num_edges(g)); + + ASSERT_EQ(2U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + remove_edge(c, a, g); + ASSERT_EQ(3U, num_edges(g)); + + ASSERT_EQ(1U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_EQ(4U, num_edges(g)); + ASSERT_TRUE(p.second); + SimpleG::edge_descriptor e3 = p.first; + + ASSERT_EQ(2U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e3, *ei++); + ASSERT_EQ(ei, ee); + + clear_in_edges(a, g); + 
ASSERT_EQ(2U, num_edges(g)); + + ASSERT_EQ(0U, in_degree(a, g)); + tie(ei, ee) = in_edges(a, g); + ASSERT_EQ(ei, ee); +} + +TEST(ue2_graph, parallel_1) { + SimpleG g; + SimpleG::vertex_iterator vi; + SimpleG::vertex_iterator ve; + auto a = add_vertex(g); + + ASSERT_EQ(1U, num_vertices(g)); + ASSERT_EQ(0U, out_degree(a, g)); + + SimpleG::out_edge_iterator ei; + SimpleG::out_edge_iterator ee; + + tie(ei, ee) = out_edges(a, g); + ASSERT_TRUE(ei == ee); + + auto p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(1U, num_edges(g)); + SimpleG::edge_descriptor e1 = p.first; + + ASSERT_EQ(1U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e2 = p.first; + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + remove_edge(e1, g); + + ASSERT_EQ(1U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ei, ee); + + p = add_edge(a, a, g); + ASSERT_TRUE(p.second); + ASSERT_EQ(2U, num_edges(g)); + SimpleG::edge_descriptor e3 = p.first; + + ASSERT_EQ(2U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(e3, *ei++); + ASSERT_EQ(ei, ee); + + remove_edge(a, a, g); + ASSERT_EQ(0U, out_degree(a, g)); + tie(ei, ee) = out_edges(a, g); + ASSERT_EQ(ei, ee); +} + +TEST(ue2_graph, edges_0a) { + SimpleG g; + auto p = edges(g); + ASSERT_EQ(p.first, p.second); +} + +TEST(ue2_graph, edges_0b) { + SimpleG g; + add_vertex(g); + ASSERT_EQ(1U, num_vertices(g)); + auto p = edges(g); + ASSERT_EQ(p.first, p.second); +} + +TEST(ue2_graph, edges_0c) { + SimpleG g; + add_vertex(g); + add_vertex(g); + ASSERT_EQ(2U, num_vertices(g)); + auto p = edges(g); + ASSERT_EQ(p.first, p.second); +} + +TEST(ue2_graph, edges_1a) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + auto v = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(v, v, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_1b) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + auto u = add_vertex(g); + auto v = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(u, v, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_1c) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + auto u = add_vertex(g); + auto v = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(v, u, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_1d) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + UNUSED auto u = add_vertex(g); + UNUSED auto v = add_vertex(g); + auto w = add_vertex(g); + auto x = add_vertex(g); + UNUSED auto y = add_vertex(g); + UNUSED auto z = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(w, x, g).first; + + 
SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_2a) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + auto v = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(v, v, g).first; + auto e2 = add_edge(v, v, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(2U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e2, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_2b) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + auto u = add_vertex(g); + auto v = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(u, v, g).first; + auto e2 = add_edge(v, u, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(2U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(1U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e2, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_2c) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + UNUSED auto s = add_vertex(g); + UNUSED auto t = add_vertex(g); + auto u = add_vertex(g); + UNUSED auto v = add_vertex(g); + auto w = add_vertex(g); + auto x = add_vertex(g); + UNUSED auto y = add_vertex(g); + UNUSED auto z = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(w, x, g).first; + auto e2 = add_edge(u, x, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(2U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + clear_in_edges(x, g); + + ASSERT_EQ(0U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, edges_3a) { + SimpleG g; + ASSERT_EQ(0U, num_edges(g)); + + UNUSED auto s = add_vertex(g); + UNUSED auto t = add_vertex(g); + auto u = add_vertex(g); + auto v = add_vertex(g); + auto w = add_vertex(g); + auto x = add_vertex(g); + UNUSED auto y = add_vertex(g); + auto z = add_vertex(g); + + ASSERT_EQ(0U, num_edges(g)); + auto e1 = add_edge(w, x, g).first; + auto e2 = add_edge(u, v, g).first; + auto e3 = add_edge(u, z, g).first; + + SimpleG::edge_iterator ei, ee; + + ASSERT_EQ(3U, num_edges(g)); + tie(ei, ee) = edges(g); + ASSERT_EQ(e2, *ei++); + ASSERT_EQ(e3, *ei++); + ASSERT_EQ(e1, *ei++); + ASSERT_EQ(ee, ei); + + remove_edge(e1, g); + + ASSERT_EQ(2U, num_edges(g)); + clear_out_edges(u, g); + + ASSERT_EQ(0U, num_edges(g)); + + tie(ei, ee) = edges(g); + ASSERT_EQ(ee, ei); +} + +TEST(ue2_graph, degree) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + auto c = add_vertex(g); + auto d = add_vertex(g); + + add_edge(a, b, g); + add_edge(a, c, g); + add_edge(a, d, g); + + ASSERT_EQ(3U, degree(a, g)); + ASSERT_EQ(1U, degree(b, g)); + ASSERT_EQ(1U, degree(c, g)); + ASSERT_EQ(1U, degree(d, g)); + + add_edge(b, c, g); + + ASSERT_EQ(3U, degree(a, g)); + ASSERT_EQ(2U, degree(b, g)); + ASSERT_EQ(2U, degree(c, g)); + ASSERT_EQ(1U, degree(d, g)); + + add_edge(d, d, g); + ASSERT_EQ(3U, degree(a, g)); + ASSERT_EQ(2U, degree(b, 
g)); + ASSERT_EQ(2U, degree(c, g)); + ASSERT_EQ(3U, degree(d, g)); add_edge(b, a, g); + ASSERT_EQ(4U, degree(a, g)); + ASSERT_EQ(3U, degree(b, g)); + ASSERT_EQ(2U, degree(c, g)); + ASSERT_EQ(3U, degree(d, g)); - ASSERT_FALSE(in_degree_equal_to(a, g, 0)); - ASSERT_TRUE(in_degree_equal_to(a, g, 1)); - ASSERT_FALSE(in_degree_equal_to(a, g, 2)); + add_edge(b, a, g); + ASSERT_EQ(5U, degree(a, g)); + ASSERT_EQ(4U, degree(b, g)); + ASSERT_EQ(2U, degree(c, g)); + ASSERT_EQ(3U, degree(d, g)); - add_edge(c, a, g); - - ASSERT_FALSE(in_degree_equal_to(a, g, 0)); - ASSERT_FALSE(in_degree_equal_to(a, g, 1)); - ASSERT_TRUE(in_degree_equal_to(a, g, 2)); - - add_edge(d, a, g); - - ASSERT_FALSE(in_degree_equal_to(a, g, 0)); - ASSERT_FALSE(in_degree_equal_to(a, g, 1)); - ASSERT_FALSE(in_degree_equal_to(a, g, 2)); + add_edge(d, d, g); + ASSERT_EQ(5U, degree(a, g)); + ASSERT_EQ(4U, degree(b, g)); + ASSERT_EQ(2U, degree(c, g)); + ASSERT_EQ(5U, degree(d, g)); } -TEST(graph_util, edge_by_target_1) { - unit_graph g; +TEST(ue2_graph, adj) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + auto c = add_vertex(g); + auto d = add_vertex(g); - unit_vertex a = add_vertex(g); - unit_vertex b = add_vertex(g); - unit_vertex c = add_vertex(g); + add_edge(a, b, g); + add_edge(a, c, g); + add_edge(a, d, g); + add_edge(b, a, g); + add_edge(b, b, g); - ASSERT_FALSE(edge_by_target(a, a, g).second); - ASSERT_FALSE(edge_by_target(a, b, g).second); - ASSERT_FALSE(edge_by_target(a, c, g).second); - ASSERT_FALSE(edge_by_target(b, a, g).second); - ASSERT_FALSE(edge_by_target(c, b, g).second); + SimpleG::adjacency_iterator ai, ae; + tie(ai, ae) = adjacent_vertices(a, g); + ASSERT_EQ(b, *ai++); + ASSERT_EQ(c, *ai++); + ASSERT_EQ(d, *ai++); + ASSERT_EQ(ai, ae); - unit_edge ab = add_edge(a, b, g).first; + tie(ai, ae) = adjacent_vertices(b, g); + ASSERT_EQ(a, *ai++); + ASSERT_EQ(b, *ai++); + ASSERT_EQ(ai, ae); - ASSERT_FALSE(edge_by_target(a, a, g).second); - ASSERT_TRUE(edge_by_target(a, b, g).second); - ASSERT_TRUE(ab == edge_by_target(a, b, g).first); - ASSERT_FALSE(edge_by_target(a, c, g).second); - ASSERT_FALSE(edge_by_target(b, a, g).second); - ASSERT_FALSE(edge_by_target(b, b, g).second); - ASSERT_FALSE(edge_by_target(c, b, g).second); + tie(ai, ae) = adjacent_vertices(c, g); + ASSERT_EQ(ai, ae); - unit_edge cb = add_edge(c, b, g).first; - - ASSERT_FALSE(edge_by_target(a, a, g).second); - ASSERT_TRUE(edge_by_target(a, b, g).second); - ASSERT_TRUE(ab == edge_by_target(a, b, g).first); - ASSERT_FALSE(edge_by_target(a, c, g).second); - ASSERT_FALSE(edge_by_target(b, a, g).second); - ASSERT_FALSE(edge_by_target(b, b, g).second); - ASSERT_TRUE(edge_by_target(c, b, g).second); - ASSERT_TRUE(cb == edge_by_target(c, b, g).first); - - unit_edge aa = add_edge(a, a, g).first; - unit_edge bb = add_edge(b, b, g).first; - - ASSERT_TRUE(edge_by_target(a, a, g).second); - ASSERT_TRUE(aa == edge_by_target(a, a, g).first); - ASSERT_TRUE(edge_by_target(a, b, g).second); - ASSERT_TRUE(ab == edge_by_target(a, b, g).first); - ASSERT_FALSE(edge_by_target(a, c, g).second); - ASSERT_FALSE(edge_by_target(b, a, g).second); - ASSERT_TRUE(edge_by_target(b, b, g).second); - ASSERT_TRUE(bb == edge_by_target(b, b, g).first); - ASSERT_TRUE(edge_by_target(c, b, g).second); - ASSERT_TRUE(cb == edge_by_target(c, b, g).first); + tie(ai, ae) = adjacent_vertices(d, g); + ASSERT_EQ(ai, ae); +} + +TEST(ue2_graph, inv_adj) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + auto c = add_vertex(g); + auto d = add_vertex(g); + + add_edge(a, 
b, g);
+    add_edge(a, c, g);
+    add_edge(a, d, g);
+    add_edge(b, a, g);
+    add_edge(b, b, g);
+
+    SimpleG::inv_adjacency_iterator ai, ae;
+    tie(ai, ae) = inv_adjacent_vertices(a, g);
+    ASSERT_EQ(b, *ai++);
+    ASSERT_EQ(ai, ae);
+
+    tie(ai, ae) = inv_adjacent_vertices(b, g);
+    ASSERT_EQ(a, *ai++);
+    ASSERT_EQ(b, *ai++);
+    ASSERT_EQ(ai, ae);
+
+    tie(ai, ae) = inv_adjacent_vertices(c, g);
+    ASSERT_EQ(a, *ai++);
+    ASSERT_EQ(ai, ae);
+
+    tie(ai, ae) = inv_adjacent_vertices(d, g);
+    ASSERT_EQ(a, *ai++);
+    ASSERT_EQ(ai, ae);
+}
+
+TEST(ue2_graph, square_brackets_v) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+    auto c = add_vertex(g);
+    auto d = add_vertex(g);
+
+    ASSERT_EQ(0U, g[a].index);
+    ASSERT_EQ(1U, g[b].index);
+    ASSERT_EQ(2U, g[c].index);
+    ASSERT_EQ(3U, g[d].index);
+
+    ASSERT_EQ("SimpleV", g[a].test_v);
+    ASSERT_EQ("SimpleV", g[b].test_v);
+    ASSERT_EQ("SimpleV", g[c].test_v);
+    ASSERT_EQ("SimpleV", g[d].test_v);
+
+    g[a].test_v = "a";
+    g[b].test_v = "b";
+    g[c].test_v = "c";
+    g[d].test_v = "d";
+
+    ASSERT_EQ("a", g[a].test_v);
+    ASSERT_EQ("b", g[b].test_v);
+    ASSERT_EQ("c", g[c].test_v);
+    ASSERT_EQ("d", g[d].test_v);
+}
+
+TEST(ue2_graph, square_brackets_e) {
+    SimpleG g;
+    auto u = add_vertex(g);
+    auto v = add_vertex(g);
+    auto a = add_edge(u, v, g).first;
+    auto b = add_edge(u, v, g).first;
+    auto c = add_edge(u, u, g).first;
+    auto d = add_edge(v, u, g).first;
+
+    ASSERT_EQ(0U, g[a].index);
+    ASSERT_EQ(1U, g[b].index);
+    ASSERT_EQ(2U, g[c].index);
+    ASSERT_EQ(3U, g[d].index);
+
+    ASSERT_EQ("SimpleE", g[a].test_e);
+    ASSERT_EQ("SimpleE", g[b].test_e);
+    ASSERT_EQ("SimpleE", g[c].test_e);
+    ASSERT_EQ("SimpleE", g[d].test_e);
+
+    g[a].test_e = "a";
+    g[b].test_e = "b";
+    g[c].test_e = "c";
+    g[d].test_e = "d";
+
+    ASSERT_EQ("a", g[a].test_e);
+    ASSERT_EQ("b", g[b].test_e);
+    ASSERT_EQ("c", g[c].test_e);
+    ASSERT_EQ("d", g[d].test_e);
+}
+
+TEST(ue2_graph, vertex_ordering_1) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+    auto c = add_vertex(g);
+    auto d = add_vertex(g);
+
+    ASSERT_LE(a, b);
+    ASSERT_LE(a, c);
+    ASSERT_LE(a, d);
+    ASSERT_LE(b, c);
+    ASSERT_LE(b, d);
+    ASSERT_LE(c, d);
+
+    g[a].index = 5;
+    g[b].index = 0;
+    g[c].index = 3;
+    g[d].index = 1;
+
+    ASSERT_LE(a, b);
+    ASSERT_LE(a, c);
+    ASSERT_LE(a, d);
+    ASSERT_LE(b, c);
+    ASSERT_LE(b, d);
+    ASSERT_LE(c, d);
+}
+
+TEST(ue2_graph, vertex_ordering_2) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+    auto c = add_vertex(g);
+    auto d = add_vertex(g);
+
+    set<SimpleG::vertex_descriptor> s;
+    s.insert(a);
+    s.insert(b);
+    s.insert(c);
+    s.insert(d);
+
+    auto it = s.begin();
+    ASSERT_EQ(a, *it++);
+    ASSERT_EQ(b, *it++);
+    ASSERT_EQ(c, *it++);
+    ASSERT_EQ(d, *it++);
+    ASSERT_EQ(it, s.end());
+
+    g[a].index = 5;
+    g[b].index = 0;
+    g[c].index = 3;
+    g[d].index = 1;
+
+    it = s.begin();
+    ASSERT_EQ(a, *it++);
+    ASSERT_EQ(b, *it++);
+    ASSERT_EQ(c, *it++);
+    ASSERT_EQ(d, *it++);
+    ASSERT_EQ(it, s.end());
+}
+
+TEST(ue2_graph, get_v_2_arg) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+
+    auto pm = get(&SimpleV::test_v, g);
+
+    ASSERT_EQ("SimpleV", pm[a]);
+    ASSERT_EQ("SimpleV", pm[b]);
+
+    pm[a] = "a";
+    pm[b] = "b";
+
+    ASSERT_EQ("a", pm[a]);
+    ASSERT_EQ("b", pm[b]);
+
+    ASSERT_EQ("a", g[a].test_v);
+    ASSERT_EQ("b", g[b].test_v);
+
+    g[a].test_v = "X";
+    g[b].test_v = "Y";
+
+    ASSERT_EQ("X", pm[a]);
+    ASSERT_EQ("Y", pm[b]);
+
+    ASSERT_EQ("X", get(pm, a));
+    ASSERT_EQ("Y", get(pm, b));
+
+    put(pm, a, "A");
+    put(pm, b, "B");
+
+    ASSERT_EQ("A", g[a].test_v);
+    ASSERT_EQ("B",
g[b].test_v); +} + +TEST(ue2_graph, get_v_2_arg_const) { + SimpleG g; + const SimpleG &gg = g; + auto a = add_vertex(g); + auto b = add_vertex(g); + + auto pm = get(&SimpleV::test_v, gg); + + ASSERT_EQ("SimpleV", pm[a]); + ASSERT_EQ("SimpleV", pm[b]); + + g[a].test_v = "a"; + g[b].test_v = "b"; + + ASSERT_EQ("a", pm[a]); + ASSERT_EQ("b", pm[b]); + + ASSERT_EQ("a", get(pm, a)); + ASSERT_EQ("b", get(pm, b)); +} + +TEST(ue2_graph, get_e_2_arg) { + SimpleG g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto b = add_edge(v, u, g).first; + + auto pm = get(&SimpleE::test_e, g); + + ASSERT_EQ("SimpleE", pm[a]); + ASSERT_EQ("SimpleE", pm[b]); + + pm[a] = "a"; + pm[b] = "b"; + + ASSERT_EQ("a", pm[a]); + ASSERT_EQ("b", pm[b]); + + ASSERT_EQ("a", g[a].test_e); + ASSERT_EQ("b", g[b].test_e); + + g[a].test_e = "X"; + g[b].test_e = "Y"; + + ASSERT_EQ("X", pm[a]); + ASSERT_EQ("Y", pm[b]); + + ASSERT_EQ("X", get(pm, a)); + ASSERT_EQ("Y", get(pm, b)); + + put(pm, a, "A"); + put(pm, b, "B"); + + ASSERT_EQ("A", g[a].test_e); + ASSERT_EQ("B", g[b].test_e); +} + +TEST(ue2_graph, get_e_2_arg_const) { + SimpleG g; + const SimpleG &gg = g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto b = add_edge(v, u, g).first; + + auto pm = get(&SimpleE::test_e, gg); + + ASSERT_EQ("SimpleE", pm[a]); + ASSERT_EQ("SimpleE", pm[b]); + + g[a].test_e = "a"; + g[b].test_e = "b"; + + ASSERT_EQ("a", pm[a]); + ASSERT_EQ("b", pm[b]); + + ASSERT_EQ("a", get(pm, a)); + ASSERT_EQ("b", get(pm, b)); +} + +TEST(ue2_graph, get_v_3_arg) { + SimpleG g; + auto a = add_vertex(g); + auto b = add_vertex(g); + + ASSERT_EQ("SimpleV", get(&SimpleV::test_v, g, a)); + ASSERT_EQ("SimpleV", get(&SimpleV::test_v, g, a)); + + get(&SimpleV::test_v, g, a) = "a"; + get(&SimpleV::test_v, g, b) = "b"; + + ASSERT_EQ("a", get(&SimpleV::test_v, g, a)); + ASSERT_EQ("b", get(&SimpleV::test_v, g, b)); + + ASSERT_EQ("a", g[a].test_v); + ASSERT_EQ("b", g[b].test_v); + + g[a].test_v = "X"; + g[b].test_v = "Y"; + + ASSERT_EQ("X", get(&SimpleV::test_v, g, a)); + ASSERT_EQ("Y", get(&SimpleV::test_v, g, b)); + + //std::decay::type x = "A"; + + put(&SimpleV::test_v, g, a, "A"); + put(&SimpleV::test_v, g, b, "B"); + + ASSERT_EQ("A", g[a].test_v); + ASSERT_EQ("B", g[b].test_v); +} + +TEST(ue2_graph, get_v_3_arg_const) { + SimpleG g; + const SimpleG &gg = g; + auto a = add_vertex(g); + auto b = add_vertex(g); + + ASSERT_EQ("SimpleV", get(&SimpleV::test_v, gg, a)); + ASSERT_EQ("SimpleV", get(&SimpleV::test_v, gg, b)); + + g[a].test_v = "a"; + g[b].test_v = "b"; + + ASSERT_EQ("a", get(&SimpleV::test_v, gg, a)); + ASSERT_EQ("b", get(&SimpleV::test_v, gg, b)); +} + +TEST(ue2_graph, get_e_3_arg) { + SimpleG g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto b = add_edge(v, u, g).first; + + ASSERT_EQ("SimpleE", get(&SimpleE::test_e, g, a)); + ASSERT_EQ("SimpleE", get(&SimpleE::test_e, g, b)); + + get(&SimpleE::test_e, g, a) = "a"; + get(&SimpleE::test_e, g, b) = "b"; + + ASSERT_EQ("a", get(&SimpleE::test_e, g, a)); + ASSERT_EQ("b", get(&SimpleE::test_e, g, b)); + + ASSERT_EQ("a", g[a].test_e); + ASSERT_EQ("b", g[b].test_e); + + g[a].test_e = "X"; + g[b].test_e = "Y"; + + ASSERT_EQ("X", get(&SimpleE::test_e, g, a)); + ASSERT_EQ("Y", get(&SimpleE::test_e, g, b)); +} + +TEST(ue2_graph, get_e_3_arg_const) { + SimpleG g; + const SimpleG &gg = g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto b = 
add_edge(v, u, g).first; + + ASSERT_EQ("SimpleE", get(&SimpleE::test_e, gg, a)); + ASSERT_EQ("SimpleE", get(&SimpleE::test_e, gg, b)); + + g[a].test_e = "a"; + g[b].test_e = "b"; + + ASSERT_EQ("a", get(&SimpleE::test_e, gg, a)); + ASSERT_EQ("b", get(&SimpleE::test_e, gg, b)); +} + +TEST(ue2_graph, get_vertex_index) { + SimpleG g; + auto a = add_vertex(g); + auto pm = get(vertex_index, g); + ASSERT_EQ(0U, pm(a)); + pm(a) = 1; + ASSERT_EQ(1U, pm[a]); + ASSERT_EQ(1U, g[a].index); + ASSERT_EQ(1U, get(vertex_index, g, a)); +} + +TEST(ue2_graph, get_vertex_index_const) { + SimpleG g; + const SimpleG &gg = g; + auto a = add_vertex(g); + auto pm = get(vertex_index, gg); + ASSERT_EQ(0U, pm(a)); + g[a].index = 1; + ASSERT_EQ(1U, pm[a]); + ASSERT_EQ(1U, get(vertex_index, gg, a)); +} + +TEST(ue2_graph, get_edge_index) { + SimpleG g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto pm = get(edge_index, g); + ASSERT_EQ(0U, pm(a)); + pm(a) = 1; + ASSERT_EQ(1U, pm[a]); + ASSERT_EQ(1U, g[a].index); + ASSERT_EQ(1U, get(edge_index, g, a)); +} + +TEST(ue2_graph, get_edge_index_const) { + SimpleG g; + const SimpleG &gg = g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto pm = get(edge_index, gg); + ASSERT_EQ(0U, pm(a)); + g[a].index = 1; + ASSERT_EQ(1U, pm[a]); + ASSERT_EQ(1U, get(edge_index, gg, a)); +} + +TEST(ue2_graph, get_vertex_all) { + SimpleG g; + auto a = add_vertex(g); + auto pm = get(vertex_all, g); + ASSERT_EQ(0U, pm(a).index); + pm(a).index = 1; + ASSERT_EQ(1U, pm[a].index); + ASSERT_EQ(1U, g[a].index); + ASSERT_EQ(1U, get(vertex_all, g, a).index); + auto &a_all = get(vertex_all, g, a); + ASSERT_EQ(1U, a_all.index); + g[a].index = 2; + ASSERT_EQ(2U, a_all.index); +} + +TEST(ue2_graph, get_vertex_all_const) { + SimpleG g; + const SimpleG &gg = g; + auto a = add_vertex(g); + auto pm = get(vertex_all, gg); + ASSERT_EQ(0U, pm(a).index); + g[a].index = 1; + ASSERT_EQ(1U, pm[a].index); + ASSERT_EQ(1U, get(vertex_all, gg, a).index); + auto &a_all = get(vertex_all, gg, a); + ASSERT_EQ(1U, a_all.index); + g[a].index = 2; + ASSERT_EQ(2U, a_all.index); +} + +TEST(ue2_graph, get_vertex_bundle) { + SimpleG g; + auto a = add_vertex(g); + auto pm = get(vertex_bundle, g); + ASSERT_EQ(0U, pm(a).index); + pm(a).index = 1; + ASSERT_EQ(1U, pm[a].index); + ASSERT_EQ(1U, g[a].index); + ASSERT_EQ(1U, get(vertex_bundle, g, a).index); + auto &a_bundle = get(vertex_bundle, g, a); + ASSERT_EQ(1U, a_bundle.index); + g[a].index = 2; + ASSERT_EQ(2U, a_bundle.index); +} + +TEST(ue2_graph, get_vertex_bundle_const) { + SimpleG g; + const SimpleG &gg = g; + auto a = add_vertex(g); + auto pm = get(vertex_bundle, gg); + ASSERT_EQ(0U, pm(a).index); + g[a].index = 1; + ASSERT_EQ(1U, pm[a].index); + ASSERT_EQ(1U, get(vertex_bundle, gg, a).index); + auto &a_bundle = get(vertex_bundle, gg, a); + ASSERT_EQ(1U, a_bundle.index); + g[a].index = 2; + ASSERT_EQ(2U, a_bundle.index); +} + +TEST(ue2_graph, get_edge_all) { + SimpleG g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = add_edge(u, v, g).first; + auto pm = get(edge_all, g); + ASSERT_EQ(0U, pm(a).index); + pm(a).index = 1; + ASSERT_EQ(1U, pm[a].index); + ASSERT_EQ(1U, g[a].index); + ASSERT_EQ(1U, get(edge_all, g, a).index); + auto &a_all = get(edge_all, g, a); + ASSERT_EQ(1U, a_all.index); + g[a].index = 2; + ASSERT_EQ(2U, a_all.index); +} + +TEST(ue2_graph, get_edge_all_const) { + SimpleG g; + const SimpleG &gg = g; + auto u = add_vertex(g); + auto v = add_vertex(g); + auto a = 
add_edge(u, v, g).first;
+    auto pm = get(edge_all, gg);
+    ASSERT_EQ(0U, pm(a).index);
+    g[a].index = 1;
+    ASSERT_EQ(1U, pm[a].index);
+    ASSERT_EQ(1U, get(edge_all, gg, a).index);
+    auto &a_all = get(edge_all, gg, a);
+    ASSERT_EQ(1U, a_all.index);
+    g[a].index = 2;
+    ASSERT_EQ(2U, a_all.index);
+}
+
+TEST(ue2_graph, get_edge_bundle) {
+    SimpleG g;
+    auto u = add_vertex(g);
+    auto v = add_vertex(g);
+    auto a = add_edge(u, v, g).first;
+    auto pm = get(edge_bundle, g);
+    ASSERT_EQ(0U, pm(a).index);
+    pm(a).index = 1;
+    ASSERT_EQ(1U, pm[a].index);
+    ASSERT_EQ(1U, g[a].index);
+    ASSERT_EQ(1U, get(edge_bundle, g, a).index);
+    auto &a_bundle = get(edge_bundle, g, a);
+    ASSERT_EQ(1U, a_bundle.index);
+    g[a].index = 2;
+    ASSERT_EQ(2U, a_bundle.index);
+}
+
+TEST(ue2_graph, get_edge_bundle_const) {
+    SimpleG g;
+    const SimpleG &gg = g;
+    auto u = add_vertex(g);
+    auto v = add_vertex(g);
+    auto a = add_edge(u, v, g).first;
+    auto pm = get(edge_bundle, gg);
+    ASSERT_EQ(0U, pm(a).index);
+    g[a].index = 1;
+    ASSERT_EQ(1U, pm[a].index);
+    ASSERT_EQ(1U, get(edge_bundle, gg, a).index);
+    auto &a_bundle = get(edge_bundle, gg, a);
+    ASSERT_EQ(1U, a_bundle.index);
+    g[a].index = 2;
+    ASSERT_EQ(2U, a_bundle.index);
+}
+
+TEST(ue2_graph, add_vertex_prop) {
+    SimpleG g;
+    SimpleV vp;
+    vp.index = 42;
+    vp.test_v = "prop";
+    auto u = add_vertex(vp, g);
+    auto v = add_vertex(vp, g);
+
+    ASSERT_EQ(0U, g[u].index);
+    ASSERT_EQ(1U, g[v].index);
+
+    ASSERT_EQ("prop", g[u].test_v);
+    ASSERT_EQ("prop", g[v].test_v);
+}
+
+TEST(ue2_graph, add_edge_prop) {
+    SimpleG g;
+    SimpleE ep;
+    ep.index = 42;
+    ep.test_e = "prop";
+    auto u = add_vertex(g);
+    auto v = add_vertex(g);
+
+    auto e = add_edge(u, v, ep, g).first;
+    auto f = add_edge(u, v, ep, g).first;
+
+    ASSERT_EQ(0U, g[e].index);
+    ASSERT_EQ(1U, g[f].index);
+
+    ASSERT_EQ("prop", g[e].test_e);
+    ASSERT_EQ("prop", g[f].test_e);
+}
+
+TEST(ue2_graph, reverse_graph) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+    auto e = add_edge(a, b, g).first;
+    reverse_graph<SimpleG, SimpleG &> rg(g);
+    auto index_map = get(vertex_index, rg);
+
+    ASSERT_EQ(0U, rg[a].index);
+    ASSERT_EQ(1U, rg[b].index);
+    ASSERT_EQ(0U, rg[e].index);
+
+    ASSERT_EQ(0U, get(vertex_index, rg, a));
+    ASSERT_EQ(1U, get(vertex_index, rg, b));
+    ASSERT_EQ(0U, get(edge_index, rg, edge(b, a, rg).first));
+
+    ASSERT_EQ(0U, index_map(a));
+    ASSERT_EQ(1U, index_map(b));
+
+    ASSERT_TRUE(edge(b, a, rg).second);
+    ASSERT_FALSE(edge(a, b, rg).second);
+}
+
+TEST(ue2_graph, reverse_graph_const) {
+    SimpleG g;
+    auto a = add_vertex(g);
+    auto b = add_vertex(g);
+    auto e = add_edge(a, b, g).first;
+    reverse_graph<const SimpleG, const SimpleG &> rg(g);
+    auto index_map = get(&SimpleV::index, rg);
+
+    // Note: reverse_graph fails to make bundles const so things break.
+    // ASSERT_EQ(0U, rg[a].index);
+    // ASSERT_EQ(1U, rg[b].index);
+    // ASSERT_EQ(0U, rg[e].index);
+
+    ASSERT_EQ(0U, get(vertex_index, g, a));
+    ASSERT_EQ(1U, get(vertex_index, g, b));
+    ASSERT_EQ(0U, get(edge_index, g, e));
+
+    ASSERT_EQ(0U, index_map(a));
+    ASSERT_EQ(1U, index_map(b));
+
+    ASSERT_TRUE(edge(b, a, rg).second);
+    ASSERT_FALSE(edge(a, b, rg).second);
+}
+
+TEST(ue2_graph, default_param) {
+    struct TestGraph : ue2_graph<TestGraph> { };
+    TestGraph g;
+
+    auto v = add_vertex(g);
+    auto e = add_edge(v, v, g).first;
+
+    ASSERT_EQ(0U, get(vertex_index, g, v));
+    ASSERT_EQ(0U, get(edge_index, g, e));
+#if !defined(_MSC_VER)
+    /* This makes MSVC up to VS2015 sad in ways that shouldn't happen.
*/ + ASSERT_EQ(0U, get(&ue2::graph_detail::default_edge_property::index, g, e)); +#endif } diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index 6bb4fcb9..804fcb1f 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -31,14 +31,12 @@ #include "grey.h" #include "compiler/compiler.h" -#include "nfa/limex_context.h" #include "nfa/limex_internal.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_util.h" #include "nfa/nfa_internal.h" #include "nfagraph/ng.h" #include "nfagraph/ng_limex.h" -#include "nfagraph/ng_restructuring.h" #include "nfagraph/ng_util.h" #include "util/alloc.h" #include "util/target_info.h" @@ -167,11 +165,10 @@ TEST_P(LimExModelTest, QueueExec) { TEST_P(LimExModelTest, CompressExpand) { ASSERT_TRUE(nfa != nullptr); - // 64-bit NFAs assume during compression that they have >= 5 bytes of - // compressed NFA state, which isn't true for our 8-state test pattern. We - // skip this test for just these models. - if (nfa->scratchStateSize == 8) { - return; + u32 real_state_size = nfa->scratchStateSize; + /* Only look at 8 bytes for limex 64 (rather than the padding) */ + if (nfa->type == LIMEX_NFA_64) { + real_state_size = sizeof(u64a); } initQueue(); @@ -195,8 +192,7 @@ TEST_P(LimExModelTest, CompressExpand) { memset(dest, 0xff, nfa->scratchStateSize); nfaExpandState(nfa.get(), dest, q.streamState, q.offset, queue_prev_byte(&q, end)); - ASSERT_TRUE(std::equal(dest, dest + nfa->scratchStateSize, - full_state.get())); + ASSERT_TRUE(std::equal(dest, dest + real_state_size, full_state.get())); } TEST_P(LimExModelTest, InitCompressedState0) { diff --git a/unit/internal/multi_bit.cpp b/unit/internal/multi_bit.cpp index 925092b3..38da1d8a 100644 --- a/unit/internal/multi_bit.cpp +++ b/unit/internal/multi_bit.cpp @@ -30,10 +30,10 @@ #include "gtest/gtest.h" #include "ue2common.h" +#include "util/compile_error.h" #include "util/make_unique.h" #include "util/multibit.h" #include "util/multibit_build.h" -#include "util/target_info.h" #include #include @@ -1303,9 +1303,11 @@ static const MultiBitTestParam multibitTests[] = { { 1U << 29, 24413 }, { 1U << 30, 50377 }, { 1U << 31, 104729 }, - - // { UINT32_MAX, 104729 }, // Very slow }; INSTANTIATE_TEST_CASE_P(MultiBit, MultiBitTest, ValuesIn(multibitTests)); +TEST(MultiBit, SizeTooBig) { + ASSERT_NO_THROW(mmbit_size(MMB_MAX_BITS)); + ASSERT_THROW(mmbit_size(MMB_MAX_BITS + 1), ResourceLimitError); +} diff --git a/unit/internal/nfagraph_equivalence.cpp b/unit/internal/nfagraph_equivalence.cpp index 3ca1923f..8fda9223 100644 --- a/unit/internal/nfagraph_equivalence.cpp +++ b/unit/internal/nfagraph_equivalence.cpp @@ -84,7 +84,7 @@ TEST(NFAGraph, RemoveEquivalence1) { ASSERT_TRUE(tmpcr.test('a')); } // check if we found our vertex - ASSERT_TRUE(a != nullptr); + ASSERT_TRUE(a != NGHolder::null_vertex()); // There should be two edges from v to nodes with reachability 'b' and 'c' NFAVertex b = NGHolder::null_vertex(); @@ -101,8 +101,8 @@ TEST(NFAGraph, RemoveEquivalence1) { } } // check if we found our vertices - ASSERT_TRUE(b != nullptr); - ASSERT_TRUE(c != nullptr); + ASSERT_TRUE(b != NGHolder::null_vertex()); + ASSERT_TRUE(c != NGHolder::null_vertex()); // both vertices should have an edge to accept ASSERT_TRUE(edge(b, g.accept, g).second); @@ -145,7 +145,7 @@ TEST(NFAGraph, RemoveEquivalence2) { ASSERT_TRUE(tmpcr.test('a')); } // check if we found our vertex - ASSERT_TRUE(a != nullptr); + ASSERT_TRUE(a != NGHolder::null_vertex()); // There should be two edges from v to nodes with reachability 'b' and 
'c' NFAVertex b = NGHolder::null_vertex(); @@ -162,8 +162,8 @@ TEST(NFAGraph, RemoveEquivalence2) { } } // check if we found our vertices - ASSERT_TRUE(b != nullptr); - ASSERT_TRUE(c != nullptr); + ASSERT_TRUE(b != NGHolder::null_vertex()); + ASSERT_TRUE(c != NGHolder::null_vertex()); // both new vertices should have edges from startDs ASSERT_TRUE(edge(g.startDs, b, g).second); @@ -207,7 +207,7 @@ TEST(NFAGraph, RemoveEquivalence3) { ASSERT_TRUE(tmpcr.test('a')); } // check if we found our 'a' - ASSERT_TRUE(a != nullptr); + ASSERT_TRUE(a != NGHolder::null_vertex()); // There should be an edge from 'a' to '.' ASSERT_EQ(1U, out_degree(a, g)); @@ -234,7 +234,6 @@ TEST(NFAGraph, RemoveEquivalence3) { NFAVertex X = NGHolder::null_vertex(); NFAVertex Y = NGHolder::null_vertex(); for (NFAVertex tmp : adjacent_vertices_range(dot2, g)) { - // we already know about dot1, so skip it if (tmp == dot1) { continue; @@ -251,8 +250,8 @@ TEST(NFAGraph, RemoveEquivalence3) { } } // check if we found both vertices - ASSERT_TRUE(X != nullptr); - ASSERT_TRUE(Y != nullptr); + ASSERT_TRUE(X != NGHolder::null_vertex()); + ASSERT_TRUE(Y != NGHolder::null_vertex()); // finally, check if these two vertices only have edges to accept ASSERT_EQ(1U, out_degree(X, g)); @@ -306,8 +305,8 @@ TEST(NFAGraph, RemoveEquivalence4) { } } // check if we found both vertices - ASSERT_TRUE(X != nullptr); - ASSERT_TRUE(Y != nullptr); + ASSERT_TRUE(X != NGHolder::null_vertex()); + ASSERT_TRUE(Y != NGHolder::null_vertex()); // now, find first dot from X ASSERT_EQ(1U, out_degree(X, g)); @@ -351,7 +350,7 @@ TEST(NFAGraph, RemoveEquivalence4) { } } // make sure we found our 'a' - ASSERT_TRUE(a != nullptr); + ASSERT_TRUE(a != NGHolder::null_vertex()); // now, check if 'a' has an edge to accept ASSERT_EQ(1U, out_degree(a, g)); @@ -396,7 +395,7 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_TRUE(edge(v, v, g).second); } // check if we found our vertex - ASSERT_TRUE(v != nullptr); + ASSERT_TRUE(v != NGHolder::null_vertex()); // now, find the vertex leading to accept NFAVertex v2 = NGHolder::null_vertex(); @@ -414,7 +413,7 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_TRUE(edge(tmp, g.accept, g).second); } // check if we found our vertex - ASSERT_TRUE(v2 != nullptr); + ASSERT_TRUE(v2 != NGHolder::null_vertex()); } // catching UE-2692 @@ -452,7 +451,7 @@ TEST(NFAGraph, RemoveEquivalence6) { ASSERT_TRUE(edge(v, g.accept, g).second); } // check if we found our vertex - ASSERT_TRUE(v != nullptr); + ASSERT_TRUE(v != NGHolder::null_vertex()); } // catching UE-2692 @@ -492,7 +491,7 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_EQ(1U, proper_out_degree(v, g)); } // check if we found our vertex - ASSERT_TRUE(v != nullptr); + ASSERT_TRUE(v != NGHolder::null_vertex()); // find the next vertex and ensure it has an edge to accept NFAVertex v2 = NGHolder::null_vertex(); @@ -511,7 +510,7 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_TRUE(edge(v2, g.accept, g).second); } // check if we found our vertex - ASSERT_TRUE(v2 != nullptr); + ASSERT_TRUE(v2 != NGHolder::null_vertex()); } TEST(NFAGraph, RemoveEquivalence_Reports1) { diff --git a/unit/internal/nfagraph_redundancy.cpp b/unit/internal/nfagraph_redundancy.cpp index acb3cc7b..be9527fd 100644 --- a/unit/internal/nfagraph_redundancy.cpp +++ b/unit/internal/nfagraph_redundancy.cpp @@ -55,13 +55,13 @@ TEST(NFAGraph, RemoveRedundancy1) { unique_ptr graph(constructGraphWithCC("(a|b)c", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); + NGHolder &g = *graph; // Run removeRedundancy - removeRedundancy(*graph, 
SOM_NONE); - NFAGraph &g = graph->g; + removeRedundancy(g, SOM_NONE); // Our graph should only have two non-special nodes - ASSERT_EQ((size_t)N_SPECIALS + 2, num_vertices(*graph)); + ASSERT_EQ((size_t)N_SPECIALS + 2, num_vertices(g)); // Dot-star start state should be connected to itself and a single other // vertex @@ -98,13 +98,13 @@ TEST(NFAGraph, RemoveRedundancy2) { unique_ptr graph(constructGraphWithCC("a.*b?c", cc, HS_FLAG_DOTALL)); ASSERT_TRUE(graph.get() != nullptr); + NGHolder &g = *graph; // Run removeRedundancy - removeRedundancy(*graph, SOM_NONE); - NFAGraph &g = graph->g; + removeRedundancy(g, SOM_NONE); // Our graph should now have only 3 non-special vertices - ASSERT_EQ((size_t)N_SPECIALS + 3, num_vertices(*graph)); + ASSERT_EQ((size_t)N_SPECIALS + 3, num_vertices(g)); // Dot-star start state should be connected to itself and a single other // vertex @@ -156,12 +156,12 @@ TEST(NFAGraph, RemoveRedundancy3) { cc, 0)); ASSERT_TRUE(graph.get() != nullptr); - unsigned countBefore = num_vertices(graph->g); + unsigned countBefore = num_vertices(*graph); removeRedundancy(*graph, SOM_NONE); // The '(a|b)?' construction (two states) should have disappeared, leaving // this expr as 'foobar.*teakettle' - ASSERT_EQ(countBefore - 2, num_vertices(graph->g)); + ASSERT_EQ(countBefore - 2, num_vertices(*graph)); } TEST(NFAGraph, RemoveRedundancy4) { @@ -169,11 +169,11 @@ TEST(NFAGraph, RemoveRedundancy4) { unique_ptr graph(constructGraphWithCC("foo([A-Z]|a|b|q)", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); - unsigned countBefore = num_vertices(graph->g); + unsigned countBefore = num_vertices(*graph); removeRedundancy(*graph, SOM_NONE); // We should end up with the alternation collapsing into one state - ASSERT_EQ(countBefore - 3, num_vertices(graph->g)); + ASSERT_EQ(countBefore - 3, num_vertices(*graph)); } TEST(NFAGraph, RemoveRedundancy5) { @@ -182,12 +182,12 @@ TEST(NFAGraph, RemoveRedundancy5) { cc, 0)); ASSERT_TRUE(graph.get() != nullptr); - unsigned countBefore = num_vertices(graph->g); + unsigned countBefore = num_vertices(*graph); removeRedundancy(*graph, SOM_NONE); // Since we don't return a start offset, the first state ('[0-9]?') is // redundant. - ASSERT_EQ(countBefore - 1, num_vertices(graph->g)); + ASSERT_EQ(countBefore - 1, num_vertices(*graph)); } TEST(NFAGraph, RemoveEdgeRedundancy1) { @@ -196,12 +196,12 @@ TEST(NFAGraph, RemoveEdgeRedundancy1) { auto graph = constructGraphWithCC("A+hatstand", cc, HS_FLAG_DOTALL); ASSERT_TRUE(graph.get() != nullptr); - unsigned countBefore = num_edges(graph->g); + unsigned countBefore = num_edges(*graph); removeEdgeRedundancy(*graph, SOM_NONE, cc); // One edge (the self-loop on the leading A+) should have been removed. - ASSERT_EQ(countBefore - 1, num_edges(graph->g)); + ASSERT_EQ(countBefore - 1, num_edges(*graph)); } TEST(NFAGraph, RemoveEdgeRedundancy2) { @@ -210,12 +210,12 @@ TEST(NFAGraph, RemoveEdgeRedundancy2) { auto graph = constructGraphWithCC("foo.*A*bar", cc, HS_FLAG_DOTALL); ASSERT_TRUE(graph.get() != nullptr); - size_t numEdgesBefore = num_edges(graph->g); - size_t numVertsBefore = num_vertices(graph->g); + size_t numEdgesBefore = num_edges(*graph); + size_t numVertsBefore = num_vertices(*graph); removeEdgeRedundancy(*graph, SOM_NONE, cc); // The .* should swallow up the A* and its self-loop. 
-    ASSERT_EQ(numEdgesBefore - 4, num_edges(graph->g));
-    ASSERT_EQ(numVertsBefore - 1, num_vertices(graph->g));
+    ASSERT_EQ(numEdgesBefore - 4, num_edges(*graph));
+    ASSERT_EQ(numVertsBefore - 1, num_vertices(*graph));
 }
diff --git a/unit/internal/nfagraph_util.cpp b/unit/internal/nfagraph_util.cpp
index 135276dd..b6952f5a 100644
--- a/unit/internal/nfagraph_util.cpp
+++ b/unit/internal/nfagraph_util.cpp
@@ -320,9 +320,9 @@ TEST(NFAGraph, cyclicVerts1) {
     add_edge(a, b, g);
     add_edge(b, a, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts2) {
@@ -341,9 +341,9 @@ TEST(NFAGraph, cyclicVerts2) {
     add_edge(c, d, g);
     add_edge(a, e, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts3) {
@@ -369,9 +369,9 @@ TEST(NFAGraph, cyclicVerts3) {
     add_edge(f, h, g);
     add_edge(h, h, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c, d, e, h}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c, d, e, h}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts4) {
@@ -396,9 +396,9 @@ TEST(NFAGraph, cyclicVerts4) {
     add_edge(e, f, g);
     add_edge(f, h, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c, d, e}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c, d, e}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts5) {
@@ -418,7 +418,7 @@ TEST(NFAGraph, cyclicVerts5) {
     add_edge(c, d, g);
     add_edge(e, c, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, b, c}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, b, c}), cyclics);
 }
diff --git a/unit/internal/rose_build_merge.cpp b/unit/internal/rose_build_merge.cpp
index 3f5a8382..291c241a 100644
--- a/unit/internal/rose_build_merge.cpp
+++ b/unit/internal/rose_build_merge.cpp
@@ -64,7 +64,6 @@ RoseVertex addVertex(RoseBuildImpl &build, RoseVertex parent, u32 lit_id) {
     RoseGraph &g = build.g;
 
     RoseVertex v = add_vertex(g);
-    g[v].idx = build.vertexIndex++;
     g[v].min_offset = 0;
     g[v].max_offset = ROSE_BOUND_INF;
     g[v].literals.insert(lit_id);
diff --git a/unit/internal/rose_mask_32.cpp b/unit/internal/rose_mask_32.cpp
new file mode 100644
index 00000000..732f51a0
--- /dev/null
+++ b/unit/internal/rose_mask_32.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "rose/validate_mask.h" +#include "gtest/gtest.h" + +#define ONES32 0xffffffffu + +union RoseLookaroundMask32 { + m256 a256; + u8 a8[32]; +}; + +struct ValidateMask32TestInfo { + RoseLookaroundMask32 data; + u32 valid_mask; + RoseLookaroundMask32 and_mask; + RoseLookaroundMask32 cmp_mask; + u32 neg_mask; +}; + +struct ValidateMask32InitInfo { + int idx; + u8 data; + u8 and_mask; + u8 cmp_mask; + u8 neg_mask; +}; + + +static const ValidateMask32InitInfo testBasicIdx[][33] = { + { + {1, 0x34, 0xf8, 0x30, 0}, + {2, 0x34, 0xf8, 0x30, 0}, + {8, 0x23, 0xff, 0x23, 0}, + {9, 0x34, 0xf8, 0x30, 0}, + {10, 0x41, 0xdf, 0x41, 0}, + {11, 0x63, 0xdd, 0x41, 0}, + {12, 0x61, 0xdd, 0x41, 0}, + {13, 0x41, 0xdf, 0x41, 0}, + {14, 0x61, 0xdf, 0x41, 0}, + {15, 0x41, 0xdf, 0x41, 0}, + {16, 0x43, 0xdd, 0x41, 0}, + {17, 0x61, 0xdd, 0x41, 0}, + {23, 0x63, 0xdd, 0x41, 0}, + {24, 0x4f, 0xfc, 0x4c, 0}, + {25, 0x4d, 0xfc, 0x4c, 0}, + {26, 0x4d, 0xfc, 0x4c, 0}, + {-1, 0, 0, 0, 0}, + }, + { + {11, 0, 0xff, 0x55, 1}, + {12, 0, 0xff, 0x36, 1}, + {13, 0, 0xfe, 0x34, 1}, + {14, 0x4d, 0xfe, 0x4c, 0}, + {15, 0x41, 0xbf, 0x01, 0}, + {16, 0x53, 0xdf, 0x73, 1}, + {17, 0x4b, 0, 0, 0}, + {18, 0, 0x2c, 0x2c, 1}, + {-1, 0, 0, 0, 0}, + }, + { + {15, 0x46, 0xdf, 0x46, 0}, + {16, 0x4f, 0xdf, 0x46, 1}, + {17, 0x6f, 0xff, 0x6f, 0}, + {18, 0x31, 0xfe, 0x30, 0}, + {19, 0x34, 0xf8, 0x30, 0}, + {20, 0x66, 0xc0, 0x40, 0}, + {21, 0x6f, 0xf0, 0x60, 0}, + {22, 0x6f, 0, 0, 0}, + {23, 0x46, 0xdf, 0x44, 1}, + {24, 0x4f, 0xdf, 0x46, 1}, + {25, 0x6f, 0xff, 0x4f, 1}, + {26, 0x31, 0xfe, 0x30, 0}, + {27, 0x34, 0xf8, 0x34, 1}, + {28, 0x66, 0xc0, 0x60, 1}, + {29, 0x6f, 0xf0, 0x6f, 1}, + {30, 0x6f, 0, 0x60, 1}, + {-1, 0, 0, 0, 0}, + }, + { + {31, 0x4a, 0x80, 0, 0}, + {-1, 0, 0, 0, 1}, + }, + { + {12, 0x2b, 0x3d, 0x2d, 1}, + {13, 0x2b, 0x3d, 0x4c, 1}, + {23, 0x4a, 0x88, 0x0a, 1}, + {-1, 0, 0, 0, 0}, + }, +}; + +static void initTestInfo(ValidateMask32TestInfo &t) { + t.data.a256 = zeroes256(); + t.valid_mask = 0xffffffff; + t.and_mask.a256 = zeroes256(); + t.cmp_mask.a256 = zeroes256(); + t.neg_mask = 0; +}; + + +static +int testBasicInit(ValidateMask32TestInfo *testB) { + int len = 0; + ValidateMask32TestInfo t; + for (size_t i = 0; i < ARRAY_LENGTH(testBasicIdx); i++) { + initTestInfo(t); + for (const auto &line: testBasicIdx[i]) { + if (line.idx < 0) { + break; + } + int index = line.idx; + t.data.a8[index] = line.data; + t.and_mask.a8[index] = line.and_mask; + t.cmp_mask.a8[index] = line.cmp_mask; + t.neg_mask |= line.neg_mask << index; + } + testB[i] = t; + len++; + } + return len; +} + +TEST(ValidateMask32, testMask32_1) { + ValidateMask32TestInfo testBasic[20]; + int test_len = testBasicInit(testBasic); + for (int i = 0; i < 
test_len; i++) { + const auto t = testBasic[i]; + EXPECT_EQ(1, validateMask32(t.data.a256, t.valid_mask, + t.and_mask.a256, t.cmp_mask.a256, + t.neg_mask)); + } +} + +TEST(ValidateMask32, testMask32_2) { + ValidateMask32TestInfo testBasic[20]; + int test_len = testBasicInit(testBasic); + for (int left = 0; left <= 32; left++) { + for (int right = 0; right + left < 32; right++) { + u32 valid_mask = ONES32 << (left + right) >> left; + for (int i = 0; i < test_len; i++) { + const auto &t = testBasic[i]; + int bool_result; + bool_result = !(valid_mask & t.neg_mask); + EXPECT_EQ(bool_result, validateMask32(t.data.a256, + valid_mask, + t.and_mask.a256, + t.cmp_mask.a256, + 0)); + bool_result = (valid_mask & t.neg_mask) == valid_mask; + EXPECT_EQ(bool_result, validateMask32(t.data.a256, + valid_mask, + t.and_mask.a256, + t.cmp_mask.a256, + ONES32)); + } + } + } +} + +TEST(ValidateMask32, testMask32_3) { + ValidateMask32TestInfo testBasic[20]; + testing::internal::Random neg_mask_rand(451); + int test_len = testBasicInit(testBasic); + for (int left = 0; left <= 32; left++) { + for (int right = 0; right + left < 32; right++) { + u32 valid_mask = ONES32 << (left + right) >> left; + for (int i = 0; i < test_len; i++) { + const auto &t = testBasic[i]; + int bool_result; + for (int j = 0; j < 5000; j++) { + u32 neg_mask = neg_mask_rand.Generate(1u << 31); + bool_result = (neg_mask & valid_mask) == + (t.neg_mask & valid_mask); + EXPECT_EQ(bool_result, validateMask32(t.data.a256, + valid_mask, + t.and_mask.a256, + t.cmp_mask.a256, + neg_mask)); + } + } + } + } +} diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index 614b641d..a4632c36 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -54,14 +54,14 @@ TEST(Shuffle, PackedExtract32_1) { for (unsigned int i = 0; i < 32; i++) { // shuffle a single 1 bit to the front u32 mask = 1U << i; - EXPECT_EQ(1U, packedExtract32(mask, mask)); - EXPECT_EQ(1U, packedExtract32(~0U, mask)); + EXPECT_EQ(1U, pext32(mask, mask)); + EXPECT_EQ(1U, pext32(~0U, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, packedExtract32(0, mask)); - EXPECT_EQ(0U, packedExtract32(~mask, mask)); + EXPECT_EQ(0U, pext32(0, mask)); + EXPECT_EQ(0U, pext32(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 32); j++) { - EXPECT_EQ(0U, packedExtract32((1U << j), mask)); + EXPECT_EQ(0U, pext32((1U << j), mask)); } } } @@ -69,10 +69,10 @@ TEST(Shuffle, PackedExtract32_1) { TEST(Shuffle, PackedExtract32_2) { // All 32 bits in mask are on u32 mask = ~0U; - EXPECT_EQ(0U, packedExtract32(0, mask)); - EXPECT_EQ(mask, packedExtract32(mask, mask)); + EXPECT_EQ(0U, pext32(0, mask)); + EXPECT_EQ(mask, pext32(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, packedExtract32(1U << i, mask)); + EXPECT_EQ(1U << i, pext32(1U << i, mask)); } } @@ -84,16 +84,16 @@ TEST(Shuffle, PackedExtract32_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ((1U << 16) - 1, packedExtract32(mask, mask)); - EXPECT_EQ((1U << 16) - 1, packedExtract32(~mask, ~mask)); - EXPECT_EQ(0U, packedExtract32(~mask, mask)); - EXPECT_EQ(0U, packedExtract32(mask, ~mask)); + EXPECT_EQ((1U << 16) - 1, pext32(mask, mask)); + EXPECT_EQ((1U << 16) - 1, pext32(~mask, ~mask)); + EXPECT_EQ(0U, pext32(~mask, mask)); + EXPECT_EQ(0U, pext32(mask, ~mask)); for (unsigned int i = 0; i < 32; i += 2) { - EXPECT_EQ(1U << (i/2), packedExtract32(1U << i, mask)); - EXPECT_EQ(0U, packedExtract32(1U << i, ~mask)); 
- EXPECT_EQ(1U << (i/2), packedExtract32(1U << (i+1), ~mask)); - EXPECT_EQ(0U, packedExtract32(1U << (i+1), mask)); + EXPECT_EQ(1U << (i/2), pext32(1U << i, mask)); + EXPECT_EQ(0U, pext32(1U << i, ~mask)); + EXPECT_EQ(1U << (i/2), pext32(1U << (i+1), ~mask)); + EXPECT_EQ(0U, pext32(1U << (i+1), mask)); } } @@ -102,14 +102,14 @@ TEST(Shuffle, PackedExtract64_1) { for (unsigned int i = 0; i < 64; i++) { // shuffle a single 1 bit to the front u64a mask = 1ULL << i; - EXPECT_EQ(1U, packedExtract64(mask, mask)); - EXPECT_EQ(1U, packedExtract64(~0ULL, mask)); + EXPECT_EQ(1U, pext64(mask, mask)); + EXPECT_EQ(1U, pext64(~0ULL, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, packedExtract64(0, mask)); - EXPECT_EQ(0U, packedExtract64(~mask, mask)); + EXPECT_EQ(0U, pext64(0, mask)); + EXPECT_EQ(0U, pext64(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 64); j++) { - EXPECT_EQ(0U, packedExtract64((1ULL << j), mask)); + EXPECT_EQ(0U, pext64((1ULL << j), mask)); } } } @@ -117,26 +117,26 @@ TEST(Shuffle, PackedExtract64_1) { TEST(Shuffle, PackedExtract64_2) { // Fill first half of mask u64a mask = 0x00000000ffffffffULL; - EXPECT_EQ(0U, packedExtract64(0, mask)); - EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); + EXPECT_EQ(0U, pext64(0, mask)); + EXPECT_EQ(0xffffffffU, pext64(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, packedExtract64(1ULL << i, mask)); + EXPECT_EQ(1U << i, pext64(1ULL << i, mask)); } // Fill second half of mask mask = 0xffffffff00000000ULL; - EXPECT_EQ(0U, packedExtract64(0, mask)); - EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); + EXPECT_EQ(0U, pext64(0, mask)); + EXPECT_EQ(0xffffffffU, pext64(mask, mask)); for (unsigned int i = 32; i < 64; i++) { - EXPECT_EQ(1U << (i - 32), packedExtract64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 32), pext64(1ULL << i, mask)); } // Try one in the middle mask = 0x0000ffffffff0000ULL; - EXPECT_EQ(0U, packedExtract64(0, mask)); - EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); + EXPECT_EQ(0U, pext64(0, mask)); + EXPECT_EQ(0xffffffffU, pext64(mask, mask)); for (unsigned int i = 16; i < 48; i++) { - EXPECT_EQ(1U << (i - 16), packedExtract64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 16), pext64(1ULL << i, mask)); } } @@ -148,16 +148,16 @@ TEST(Shuffle, PackedExtract64_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); - EXPECT_EQ(0xffffffffU, packedExtract64(~mask, ~mask)); - EXPECT_EQ(0U, packedExtract64(~mask, mask)); - EXPECT_EQ(0U, packedExtract64(mask, ~mask)); + EXPECT_EQ(0xffffffffU, pext64(mask, mask)); + EXPECT_EQ(0xffffffffU, pext64(~mask, ~mask)); + EXPECT_EQ(0U, pext64(~mask, mask)); + EXPECT_EQ(0U, pext64(mask, ~mask)); for (unsigned int i = 0; i < 64; i += 2) { - EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << i, mask)); - EXPECT_EQ(0U, packedExtract64(1ULL << i, ~mask)); - EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << (i+1), ~mask)); - EXPECT_EQ(0U, packedExtract64(1ULL << (i+1), mask)); + EXPECT_EQ(1U << (i/2), pext64(1ULL << i, mask)); + EXPECT_EQ(0U, pext64(1ULL << i, ~mask)); + EXPECT_EQ(1U << (i/2), pext64(1ULL << (i+1), ~mask)); + EXPECT_EQ(0U, pext64(1ULL << (i+1), mask)); } } diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp index 81495a9c..06407c41 100644 --- a/unit/internal/shufti.cpp +++ b/unit/internal/shufti.cpp @@ -47,7 +47,7 @@ TEST(Shufti, BuildMask1) { chars.set('a'); - int ret = shuftiBuildMasks(chars, &lomask, &himask); + 
int ret = shuftiBuildMasks(chars, (u8 *)&lomask, (u8 *)&himask); ASSERT_NE(-1, ret); u8 *lo = (u8 *)&lomask; @@ -75,7 +75,7 @@ TEST(Shufti, BuildMask2) { chars.set('a'); chars.set('B'); - int ret = shuftiBuildMasks(chars, &lomask, &himask); + int ret = shuftiBuildMasks(chars, (u8 *)&lomask, (u8 *)&himask); ASSERT_NE(-1, ret); u8 *lo = (u8 *)&lomask; @@ -96,7 +96,7 @@ TEST(Shufti, BuildMask4) { chars.set('A'); chars.set('b'); - int ret = shuftiBuildMasks(chars, &lomask, &himask); + int ret = shuftiBuildMasks(chars, (u8 *)&lomask, (u8 *)&himask); ASSERT_NE(-1, ret); u8 *lo = (u8 *)&lomask; @@ -113,12 +113,12 @@ TEST(Shufti, ExecNoMatch1) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; - for (size_t i = 0; i < 16; i++) { + for (size_t i = 0; i < 32; i++) { const u8 *rv = shuftiExec(lo, hi, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); ASSERT_LE(((size_t)t1 + strlen(t1)) & ~0xf, (size_t)rv); @@ -132,7 +132,7 @@ TEST(Shufti, ExecNoMatch2) { chars.set('a'); chars.set('B'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -150,7 +150,7 @@ TEST(Shufti, ExecNoMatch3) { CharReach chars; chars.set('V'); /* V = 0x56, e = 0x65 */ - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -168,16 +168,16 @@ TEST(Shufti, ExecMatch1) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ - char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbabbbbbbbbbbbb"; - for (size_t i = 0; i < 16; i++) { + for (size_t i = 0; i < 32; i++) { const u8 *rv = shuftiExec(lo, hi, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); - ASSERT_EQ((size_t)t1 + 17, (size_t)rv); + ASSERT_EQ((size_t)t1 + 33, (size_t)rv); } } @@ -187,7 +187,7 @@ TEST(Shufti, ExecMatch2) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -207,7 +207,7 @@ TEST(Shufti, ExecMatch3) { chars.set('a'); chars.set('B'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -229,7 +229,7 @@ TEST(Shufti, ExecMatch4) { chars.set('A'); chars.set('c'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -263,7 +263,7 @@ TEST(Shufti, ExecMatch5) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -283,8 +283,8 @@ TEST(DoubleShufti, BuildMask1) { lits.insert(make_pair('a', 'B')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool ret = 
shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; @@ -326,8 +326,8 @@ TEST(DoubleShufti, BuildMask2) { lits.insert(make_pair('a','z')); lits.insert(make_pair('B','z')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; @@ -354,8 +354,8 @@ TEST(DoubleShufti, BuildMask4) { lits.insert(make_pair('A','z')); lits.insert(make_pair('b','z')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; @@ -383,8 +383,8 @@ TEST(DoubleShufti, BuildMask5) { CharReach bytes; bytes.set('X'); - bool ret = shuftiBuildDoubleMasks(bytes, lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool ret = shuftiBuildDoubleMasks(bytes, lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; @@ -421,8 +421,8 @@ TEST(DoubleShufti, BuildMask6) { lits.insert(make_pair('A','x')); lits.insert(make_pair('b','x')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; @@ -473,8 +473,8 @@ TEST(DoubleShufti, BuildMask7) { lits.insert(make_pair('u','v')); lits.insert(make_pair('w','x')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, - &lo2m, &hi2m); + bool rv = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1m, (u8 *)&hi1m, + (u8 *)&lo2m, (u8 *)&hi2m); ASSERT_FALSE(rv); } @@ -485,8 +485,8 @@ TEST(DoubleShufti, ExecNoMatch1) { lits.insert(make_pair('a','b')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, - &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -506,7 +506,8 @@ TEST(DoubleShufti, ExecNoMatch1b) { lits.insert(make_pair('b','a')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -527,7 +528,8 @@ TEST(DoubleShufti, ExecNoMatch2) { lits.insert(make_pair('a','b')); lits.insert(make_pair('B','b')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -548,14 +550,15 @@ TEST(DoubleShufti, ExecNoMatch2b) { lits.insert(make_pair('b','a')); lits.insert(make_pair('b','B')); - bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; for (size_t i = 0; i < 16; i++) { - const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2, - (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + const u8 *rv = 
shuftiDoubleExec(lo1, hi1, lo2, hi2, (u8 *)t1 + i,
+                                        (u8 *)t1 + strlen(t1));
 
         ASSERT_EQ((size_t)t1 + i + 15, (size_t)rv);
     }
 }
@@ -568,7 +571,8 @@ TEST(DoubleShufti, ExecNoMatch3) {
 
     lits.insert(make_pair('V','e'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee";
@@ -588,7 +592,8 @@ TEST(DoubleShufti, ExecNoMatch3b) {
 
     lits.insert(make_pair('e','V'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee";
@@ -601,6 +606,28 @@ TEST(DoubleShufti, ExecNoMatch3b) {
     }
 }
 
+TEST(DoubleShufti, ExecMatchShort1) {
+    m128 lo1, hi1, lo2, hi2;
+
+    flat_set<pair<u8, u8>> lits;
+
+    lits.insert(make_pair('a','b'));
+
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
+    ASSERT_TRUE(ret);
+
+    /* 0123456789012345678901234567890 */
+    char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbb";
+
+    for (size_t i = 0; i < 16; i++) {
+        const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2,
+                                        (u8 *)t1 + i, (u8 *)t1 + strlen(t1));
+
+        ASSERT_EQ((size_t)t1 + 17, (size_t)rv);
+    }
+}
+
 TEST(DoubleShufti, ExecMatch1) {
     m128 lo1, hi1, lo2, hi2;
@@ -608,7 +635,8 @@ TEST(DoubleShufti, ExecMatch1) {
 
     lits.insert(make_pair('a','b'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     /* 0123456789012345678901234567890 */
@@ -629,7 +657,8 @@ TEST(DoubleShufti, ExecMatch2) {
 
     lits.insert(make_pair('a','a'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     /* 0123456789012345678901234567890 */
@@ -651,7 +680,8 @@ TEST(DoubleShufti, ExecMatch3) {
     lits.insert(make_pair('B','a'));
     lits.insert(make_pair('a','a'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     /* 0123456789012345678901234567890 */
@@ -675,7 +705,8 @@ TEST(DoubleShufti, ExecMatch4) {
     lits.insert(make_pair('C','a'));
     lits.insert(make_pair('c','a'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     /* 0123456789012345678901234567890 */
@@ -717,7 +748,8 @@ TEST(DoubleShufti, ExecMatch4b) {
     lits.insert(make_pair('a','C'));
     lits.insert(make_pair('a','c'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     /* 0123456789012345678901234567890 */
@@ -756,7 +788,8 @@ TEST(DoubleShufti, ExecMatch5) {
 
     lits.insert(make_pair('a','A'));
 
-    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2);
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, (u8 *)&lo1, (u8 *)&hi1,
+                                      (u8 *)&lo2, (u8 *)&hi2);
     ASSERT_TRUE(ret);
 
     char t1[] =
"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -780,7 +813,8 @@ TEST(DoubleShufti, ExecMatchMixed1) { // just one one-byte literal onebyte.set('a'); - bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -804,7 +838,8 @@ TEST(DoubleShufti, ExecMatchMixed2) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -838,7 +873,8 @@ TEST(DoubleShufti, ExecMatchMixed3) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, (u8 *)&lo1, (u8 *)&hi1, + (u8 *)&lo2, (u8 *)&hi2); ASSERT_TRUE(ret); const int len = 420; @@ -871,7 +907,7 @@ TEST(ReverseShufti, ExecNoMatch1) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -890,7 +926,7 @@ TEST(ReverseShufti, ExecNoMatch2) { chars.set('a'); chars.set('B'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -908,7 +944,7 @@ TEST(ReverseShufti, ExecNoMatch3) { CharReach chars; chars.set('V'); /* V = 0x56, e = 0x65 */ - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -926,7 +962,7 @@ TEST(ReverseShufti, ExecMatch1) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -947,7 +983,7 @@ TEST(ReverseShufti, ExecMatch2) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -969,7 +1005,7 @@ TEST(ReverseShufti, ExecMatch3) { chars.set('a'); chars.set('B'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -1003,7 +1039,7 @@ TEST(ReverseShufti, ExecMatch4) { chars.set('A'); chars.set('c'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); /* 0123456789012345678901234567890 */ @@ -1038,7 +1074,7 @@ TEST(ReverseShufti, ExecMatch5) { CharReach chars; chars.set('a'); - int ret = shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -1058,7 +1094,7 @@ TEST(ReverseShufti, ExecMatch6) { CharReach chars; chars.set('a'); - int ret = 
shuftiBuildMasks(chars, &lo, &hi); + int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); const size_t len = 256; diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 3c07b2b0..7b34d92e 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -110,10 +110,10 @@ void simd_setbit(m128 *a, unsigned int i) { return setbit128(a, i); } void simd_setbit(m256 *a, unsigned int i) { return setbit256(a, i); } void simd_setbit(m384 *a, unsigned int i) { return setbit384(a, i); } void simd_setbit(m512 *a, unsigned int i) { return setbit512(a, i); } -bool simd_testbit(const m128 *a, unsigned int i) { return testbit128(a, i); } -bool simd_testbit(const m256 *a, unsigned int i) { return testbit256(a, i); } -bool simd_testbit(const m384 *a, unsigned int i) { return testbit384(a, i); } -bool simd_testbit(const m512 *a, unsigned int i) { return testbit512(a, i); } +bool simd_testbit(const m128 &a, unsigned int i) { return testbit128(a, i); } +bool simd_testbit(const m256 &a, unsigned int i) { return testbit256(a, i); } +bool simd_testbit(const m384 &a, unsigned int i) { return testbit384(a, i); } +bool simd_testbit(const m512 &a, unsigned int i) { return testbit512(a, i); } u32 simd_diffrich(const m128 &a, const m128 &b) { return diffrich128(a, b); } u32 simd_diffrich(const m256 &a, const m256 &b) { return diffrich256(a, b); } u32 simd_diffrich(const m384 &a, const m384 &b) { return diffrich384(a, b); } @@ -419,15 +419,15 @@ TYPED_TEST(SimdUtilsTest, testbit) { // First, all bits are on in 'ones'. for (unsigned int i = 0; i < total_bits; i++) { - ASSERT_EQ(1, simd_testbit(&ones, i)) << "bit " << i << " is on"; + ASSERT_EQ(1, simd_testbit(ones, i)) << "bit " << i << " is on"; } // Try individual bits; only 'i' should be on. for (unsigned int i = 0; i < total_bits; i++) { TypeParam a = setbit(i); for (unsigned int j = 0; j < total_bits; j++) { - ASSERT_EQ(i == j ? 1 : 0, simd_testbit(&a, j)) << "bit " << i - << " is wrong"; + ASSERT_EQ(i == j ? 
1 : 0, simd_testbit(a, j)) << "bit " << i + << " is wrong"; } } } @@ -470,7 +470,7 @@ TYPED_TEST(SimdUtilsTest, diffrich) { // and nothing is on in zeroes for (unsigned int i = 0; i < total_bits; i++) { - ASSERT_EQ(0, simd_testbit(&zeroes, i)) << "bit " << i << " is off"; + ASSERT_EQ(0, simd_testbit(zeroes, i)) << "bit " << i << " is off"; } // All-zeroes and all-ones differ in all words @@ -614,6 +614,12 @@ TEST(SimdUtilsTest, set16x8) { } } +TEST(SimdUtilsTest, set4x32) { + u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; + m128 simd = set4x32(cmp[0]); + ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); +} + #if defined(__AVX2__) TEST(SimdUtilsTest, set32x8) { char cmp[sizeof(m256)]; @@ -693,4 +699,50 @@ TEST(SimdUtilsTest, variableByteShift128) { EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); } +TEST(SimdUtilsTest, max_u8_m128) { + char base1[] = "0123456789ABCDE\xfe"; + char base2[] = "!!23455889aBCd\xff\xff"; + char expec[] = "0123456889aBCd\xff\xff"; + m128 in1 = loadu128(base1); + m128 in2 = loadu128(base2); + m128 result = max_u8_m128(in1, in2); + EXPECT_TRUE(!diff128(result, loadu128(expec))); +} + +TEST(SimdUtilsTest, min_u8_m128) { + char base1[] = "0123456789ABCDE\xfe"; + char base2[] = "!!23455889aBCd\xff\xff"; + char expec[] = "!!23455789ABCDE\xfe"; + m128 in1 = loadu128(base1); + m128 in2 = loadu128(base2); + m128 result = min_u8_m128(in1, in2); + EXPECT_TRUE(!diff128(result, loadu128(expec))); +} + +TEST(SimdUtilsTest, sadd_u8_m128) { + unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + m128 in1 = loadu128(base1); + m128 in2 = loadu128(base2); + m128 result = sadd_u8_m128(in1, in2); + EXPECT_TRUE(!diff128(result, loadu128(expec))); +} + +TEST(SimdUtilsTest, sub_u8_m128) { + unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + m128 in1 = loadu128(base1); + m128 in2 = loadu128(base2); + m128 result = sub_u8_m128(in1, in2); + EXPECT_TRUE(!diff128(result, loadu128(expec))); +} + } // namespace diff --git a/unit/internal/truffle.cpp b/unit/internal/truffle.cpp index 859c8a08..e9e4f19c 100644 --- a/unit/internal/truffle.cpp +++ b/unit/internal/truffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,9 +45,9 @@ TEST(Truffle, CompileDot) { chars.setall(); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); - CharReach out = truffle2cr(mask1, mask2); + CharReach out = truffle2cr((u8 *)&mask1, (u8 *)&mask2); ASSERT_EQ(out, chars); @@ -64,8 +64,8 @@ TEST(Truffle, CompileChars) { mask2 = zeroes128(); chars.clear(); chars.set((u8)c); - truffleBuildMasks(chars, &mask1, &mask2); - CharReach out = truffle2cr(mask1, mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); + CharReach out = truffle2cr((u8 *)&mask1, (u8 *)&mask2); 
ASSERT_EQ(out, chars); } @@ -74,8 +74,8 @@ TEST(Truffle, CompileChars) { mask1 = zeroes128(); mask2 = zeroes128(); chars.set((u8)c); - truffleBuildMasks(chars, &mask1, &mask2); - CharReach out = truffle2cr(mask1, mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); + CharReach out = truffle2cr((u8 *)&mask1, (u8 *)&mask2); ASSERT_EQ(out, chars); } @@ -84,8 +84,8 @@ TEST(Truffle, CompileChars) { mask1 = zeroes128(); mask2 = zeroes128(); chars.clear((u8)c); - truffleBuildMasks(chars, &mask1, &mask2); - CharReach out = truffle2cr(mask1, mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); + CharReach out = truffle2cr((u8 *)&mask1, (u8 *)&mask2); ASSERT_EQ(out, chars); } @@ -100,7 +100,7 @@ TEST(Truffle, ExecNoMatch1) { chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\xff"; @@ -119,7 +119,7 @@ TEST(Truffle, ExecNoMatch2) { chars.set('a'); chars.set('B'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -137,7 +137,7 @@ TEST(Truffle, ExecNoMatch3) { chars.set('V'); /* V = 0x56, e = 0x65 */ - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -154,7 +154,7 @@ TEST(Truffle, ExecMiniMatch0) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); char t1[] = "a"; @@ -169,7 +169,7 @@ TEST(Truffle, ExecMiniMatch1) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); char t1[] = "bbbbbbbabbb"; @@ -184,7 +184,7 @@ TEST(Truffle, ExecMiniMatch2) { CharReach chars; chars.set(0); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); char t1[] = "bbbbbbb\0bbb"; @@ -199,7 +199,7 @@ TEST(Truffle, ExecMiniMatch3) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); char t1[] = "\0\0\0\0\0\0\0a\0\0\0"; @@ -214,7 +214,7 @@ TEST(Truffle, ExecMatchBig) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); std::array t1; t1.fill('b'); @@ -234,7 +234,7 @@ TEST(Truffle, ExecMatch1) { chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -253,7 +253,7 @@ TEST(Truffle, ExecMatch2) { chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -273,7 +273,7 @@ TEST(Truffle, ExecMatch3) { chars.set('a'); chars.set('B'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbBaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -295,7 +295,7 @@ TEST(Truffle, ExecMatch4) { chars.set('A'); chars.set('c'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = 
"bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -329,7 +329,7 @@ TEST(Truffle, ExecMatch5) { chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -349,7 +349,7 @@ TEST(Truffle, ExecMatch6) { // [0-Z] - includes some graph chars chars.setRange('0', 'Z'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); std::array t1; t1.fill('*'); // it's full of stars! @@ -370,7 +370,7 @@ TEST(Truffle, ExecMatch7) { // hi bits chars.setRange(127, 255); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); std::array t1; t1.fill('*'); // it's full of stars! @@ -389,7 +389,7 @@ TEST(ReverseTruffle, ExecNoMatch1) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; size_t len = strlen(t1); @@ -408,7 +408,7 @@ TEST(ReverseTruffle, ExecNoMatch2) { chars.set('a'); chars.set('B'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; size_t len = strlen(t1); @@ -425,7 +425,7 @@ TEST(ReverseTruffle, ExecNoMatch3) { CharReach chars; chars.set('V'); /* V = 0x56, e = 0x65 */ - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; size_t len = strlen(t1); @@ -442,7 +442,7 @@ TEST(ReverseTruffle, ExecMiniMatch0) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &lo, &hi); + truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); char t1[] = "a"; @@ -457,7 +457,7 @@ TEST(ReverseTruffle, ExecMiniMatch1) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbabbbb"; @@ -475,7 +475,7 @@ TEST(ReverseTruffle, ExecMiniMatch2) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "babbbbbabbbb"; @@ -494,7 +494,7 @@ TEST(ReverseTruffle, ExecMatch1) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbabbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -514,7 +514,7 @@ TEST(ReverseTruffle, ExecMatch2) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbabbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -535,7 +535,7 @@ TEST(ReverseTruffle, ExecMatch3) { chars.set('a'); chars.set('B'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaBbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -568,7 +568,7 @@ TEST(ReverseTruffle, ExecMatch4) { chars.set('A'); chars.set('c'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); /* 
0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -602,7 +602,7 @@ TEST(ReverseTruffle, ExecMatch5) { CharReach chars; chars.set('a'); - truffleBuildMasks(chars, &mask1, &mask2); + truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; size_t len = strlen(t1); diff --git a/unit/internal/uniform_ops.cpp b/unit/internal/uniform_ops.cpp index 33d7cd30..10defdbd 100644 --- a/unit/internal/uniform_ops.cpp +++ b/unit/internal/uniform_ops.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -156,26 +156,26 @@ TEST(Uniform, loadstore_m512) { TEST(Uniform, testbit_u32) { for (u32 i = 0; i < 32; i++) { u32 v = 0; - EXPECT_EQ((char)0, testbit_u32(&v, i)); + EXPECT_EQ((char)0, testbit_u32(v, i)); v |= 1ULL << i; - EXPECT_EQ((char)1, testbit_u32(&v, i)); + EXPECT_EQ((char)1, testbit_u32(v, i)); v = ~v; - EXPECT_EQ((char)0, testbit_u32(&v, i)); + EXPECT_EQ((char)0, testbit_u32(v, i)); v |= 1ULL << i; - EXPECT_EQ((char)1, testbit_u32(&v, i)); + EXPECT_EQ((char)1, testbit_u32(v, i)); } } TEST(Uniform, testbit_u64a) { for (u32 i = 0; i < 64; i++) { u64a v = 0; - EXPECT_EQ((char)0, testbit_u64a(&v, i)); + EXPECT_EQ((char)0, testbit_u64a(v, i)); v |= 1ULL << i; - EXPECT_EQ((char)1, testbit_u64a(&v, i)); + EXPECT_EQ((char)1, testbit_u64a(v, i)); v = ~v; - EXPECT_EQ((char)0, testbit_u64a(&v, i)); + EXPECT_EQ((char)0, testbit_u64a(v, i)); v |= 1ULL << i; - EXPECT_EQ((char)1, testbit_u64a(&v, i)); + EXPECT_EQ((char)1, testbit_u64a(v, i)); } } @@ -183,7 +183,7 @@ TEST(Uniform, clearbit_u32) { for (u32 i = 0; i < 32; i++) { u32 v = ~0U; clearbit_u32(&v, i); - EXPECT_EQ((char)0, testbit_u32(&v, i)); + EXPECT_EQ((char)0, testbit_u32(v, i)); v = ~v; clearbit_u32(&v, i); EXPECT_EQ(0U, v); @@ -194,7 +194,7 @@ TEST(Uniform, clearbit_u64a) { for (u32 i = 0; i < 64; i++) { u64a v = ~0ULL; clearbit_u64a(&v, i); - EXPECT_EQ((char)0, testbit_u64a(&v, i)); + EXPECT_EQ((char)0, testbit_u64a(v, i)); v = ~v; clearbit_u64a(&v, i); EXPECT_EQ(0ULL, v); diff --git a/unit/internal/vermicelli.cpp b/unit/internal/vermicelli.cpp index 5d66a332..5e4a8253 100644 --- a/unit/internal/vermicelli.cpp +++ b/unit/internal/vermicelli.cpp @@ -126,27 +126,29 @@ TEST(DoubleVermicelli, ExecNoMatch1) { const u8 *rv = vermicelliDoubleExec('a', 'b', 0, (u8 *)t1 + i, (u8 *)t1 + strlen(t1) - j); - ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1 + strlen(t1) - j), (size_t)rv); rv = vermicelliDoubleExec('B', 'b', 0, (u8 *)t1 + i, (u8 *)t1 + strlen(t1) - j); - ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1 + strlen(t1) - j), (size_t)rv); rv = vermicelliDoubleExec('A', 'B', 1, (u8 *)t1 + i, (u8 *)t1 + strlen(t1) - j); - ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1 + strlen(t1) - j), (size_t)rv); + /* partial match */ rv = vermicelliDoubleExec('b', 'B', 0, (u8 *)t1 + i, (u8 *)t1 + strlen(t1) - j); - ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + /* partial match */ rv = vermicelliDoubleExec('B', 'A', 1, (u8 *)t1 + i, (u8 *)t1 + strlen(t1) - j); - ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); + 
ASSERT_EQ(((size_t)t1 + strlen(t1) - j - 1), (size_t)rv); } } } @@ -353,30 +355,32 @@ TEST(DoubleVermicelliMasked, ExecNoMatch1) { t1_raw + i, t1_raw + t1.length() - i - j); - ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); - rv = vermicelliDoubleMaskedExec('B', 'b', 0xff, CASE_CLEAR, + ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j), (size_t)rv); + + rv = vermicelliDoubleMaskedExec('B', 'B', 0xff, CASE_CLEAR, t1_raw + i, t1_raw + t1.length() - i - j); - ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j), (size_t)rv); rv = vermicelliDoubleMaskedExec('A', 'B', CASE_CLEAR, CASE_CLEAR, t1_raw + i, t1_raw + t1.length() -i - j); - ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j), (size_t)rv); - rv = vermicelliDoubleMaskedExec('b', 'B', CASE_CLEAR, 0xff, + /* partial match */ + rv = vermicelliDoubleMaskedExec('B', 'B', CASE_CLEAR, 0xff, t1_raw + i, t1_raw + t1.length() - i - j); - ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); rv = vermicelliDoubleMaskedExec('B', 'A', 0xff, 0xff, t1_raw + i, t1_raw + t1.length() - i - j); - ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j - 1), (size_t)rv); + ASSERT_EQ(((size_t)t1_raw + t1.length() - i - j), (size_t)rv); } } } diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index dc731322..c0a6bc21 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -1,7 +1,10 @@ # utility libs +CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") -include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} + ${PROJECT_SOURCE_DIR}) set_source_files_properties( ${CMAKE_BINARY_DIR}/tools/ExpressionParser.cpp @@ -31,3 +34,14 @@ SET(corpusomatic_SRCS ) add_library(corpusomatic STATIC ${corpusomatic_SRCS}) +set(databaseutil_SRCS + database_util.cpp + database_util.h +) +add_library(databaseutil STATIC ${databaseutil_SRCS}) + +set(crosscompileutil_SRCS + cross_compile.cpp + cross_compile.h + ) +add_library(crosscompileutil STATIC ${crosscompileutil_SRCS}) diff --git a/util/cross_compile.cpp b/util/cross_compile.cpp new file mode 100644 index 00000000..b4d1f5f1 --- /dev/null +++ b/util/cross_compile.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "cross_compile.h"
+#include "src/ue2common.h"
+#include "src/hs_compile.h"
+#include "src/util/make_unique.h"
+
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+struct XcompileMode {
+    const char *name;
+    unsigned long long cpu_features;
+};
+
+static const XcompileMode xcompile_options[] = {
+    { "avx2", HS_CPU_FEATURES_AVX2 },
+    { "base", 0 },
+};
+
+unique_ptr<hs_platform_info> xcompileReadMode(const char *s) {
+    hs_platform_info rv;
+    UNUSED hs_error_t err;
+    err = hs_populate_platform(&rv);
+    assert(!err);
+
+    string str(s);
+    string mode = str.substr(0, str.find(":"));
+    string opt = str.substr(str.find(":")+1, str.npos);
+    bool found_mode = false;
+
+    if (!opt.empty()) {
+        const size_t numOpts = ARRAY_LENGTH(xcompile_options);
+        for (size_t i = 0; i < numOpts; i++) {
+            if (opt.compare(xcompile_options[i].name) == 0) {
+                DEBUG_PRINTF("found opt %zu:%llu\n", i,
+                             xcompile_options[i].cpu_features);
+                rv.cpu_features = xcompile_options[i].cpu_features;
+                found_mode = true;
+                break;
+            }
+        }
+    }
+
+    if (!found_mode) {
+        return nullptr;
+    } else {
+        DEBUG_PRINTF("cpu_features %llx\n", rv.cpu_features);
+        return ue2::make_unique<hs_platform_info>(rv);
+    }
+}
+
+string to_string(const hs_platform_info &p) {
+    ostringstream out;
+    if (p.tune) {
+        out << p.tune;
+    }
+
+    if (p.cpu_features) {
+        u64a features = p.cpu_features;
+        if (features & HS_CPU_FEATURES_AVX2) {
+            out << " avx2";
+            features &= ~HS_CPU_FEATURES_AVX2;
+        }
+
+        if (features) {
+            out << " " << "?cpu_features?:" << features;
+        }
+    }
+
+    return out.str();
+}
+
+string xcompileUsage(void) {
+    string variants = "Instruction set options: ";
+    const size_t numOpts = ARRAY_LENGTH(xcompile_options);
+    for (size_t i = 0; i < numOpts; i++) {
+        variants += xcompile_options[i].name;
+        if (i + 1 != numOpts) {
+            variants += ", ";
+        }
+    }
+
+    return variants;
+}
diff --git a/util/cross_compile.h b/util/cross_compile.h
new file mode 100644
index 00000000..ddfc7b10
--- /dev/null
+++ b/util/cross_compile.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CROSS_COMPILE_H
+#define CROSS_COMPILE_H
+
+#include <memory>
+#include <string>
+
+struct hs_platform_info;
+
+std::unique_ptr<hs_platform_info> xcompileReadMode(const char *s);
+std::string xcompileUsage(void);
+
+std::string to_string(const hs_platform_info &p);
+
+#endif /* CROSS_COMPILE_H */
diff --git a/util/database_util.cpp b/util/database_util.cpp
new file mode 100644
index 00000000..3df75e2a
--- /dev/null
+++ b/util/database_util.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "database_util.h"
+
+#include "hs_common.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+
+#if defined(HAVE_MMAP)
+#include <sys/mman.h> // for mmap
+#include <unistd.h>   // for close
+#include <fcntl.h>
+#include <sys/stat.h>
+#endif
+
+using namespace std;
+
+bool saveDatabase(const hs_database_t *db, const char *filename, bool verbose) {
+    assert(db);
+    assert(filename);
+
+    if (verbose) {
+        cout << "Saving database to: " << filename << endl;
+    }
+
+    char *bytes = nullptr;
+    size_t length = 0;
+    hs_error_t err = hs_serialize_database(db, &bytes, &length);
+    if (err != HS_SUCCESS) {
+        return false;
+    }
+
+    assert(bytes);
+    assert(length > 0);
+
+    ofstream out(filename, ios::binary);
+    out.write(bytes, length);
+    out.close();
+
+    ::free(bytes);
+
+    return true;
+}
+
+hs_database_t *loadDatabase(const char *filename, bool verbose) {
+    assert(filename);
+
+    if (verbose) {
+        cout << "Loading database from: " << filename << endl;
+    }
+
+    char *bytes = nullptr;
+
+#if defined(HAVE_MMAP)
+    // Use mmap to read the file
+    int fd = open(filename, O_RDONLY);
+    if (fd < 0) {
+        return nullptr;
+    }
+    struct stat st;
+    if (fstat(fd, &st) < 0) {
+        close(fd);
+        return nullptr;
+    }
+    size_t len = st.st_size;
+
+    bytes = (char *)mmap(nullptr, len, PROT_READ, MAP_SHARED, fd, 0);
+    if (bytes == MAP_FAILED) {
+        cout << "mmap failed" << endl;
+        close(fd);
+        return nullptr;
+    }
+#else
+    // Fall back on stream IO
+    ifstream is;
+    is.open(filename, ios::in | ios::binary);
+    if (!is.is_open()) {
+        return nullptr;
+    }
+    is.seekg(0, ios::end);
+    size_t len = is.tellg();
+    if (verbose) {
+        cout << "Reading " << len << " bytes" << endl;
+    }
+    is.seekg(0, ios::beg);
+    bytes = new char[len];
+    is.read(bytes, len);
+    is.close();
+#endif
+
+    assert(bytes);
+
+    if (verbose) {
+        char *info = nullptr;
+        hs_error_t err = hs_serialized_database_info(bytes, len, &info);
+        if (err) {
+            cout << "Unable to decode serialized database info: " << err
+                 << endl;
+        } else if (info) {
+            cout << "Serialized database info: " << info << endl;
+            std::free(info);
+        } else {
+            cout << "Unable to decode serialized database info." << endl;
+        }
+    }
+
+    hs_database_t *db = nullptr;
+    hs_error_t err = hs_deserialize_database(bytes, len, &db);
+
+#if defined(HAVE_MMAP)
+    munmap(bytes, len);
+    close(fd);
+#else
+    delete [] bytes;
+#endif
+
+    if (err != HS_SUCCESS) {
+        cout << "hs_deserialize_database call failed: " << err << endl;
+        return nullptr;
+    }
+
+    assert(db);
+
+    return db;
+}
diff --git a/util/database_util.h b/util/database_util.h
new file mode 100644
index 00000000..badd036d
--- /dev/null
+++ b/util/database_util.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DATABASE_UTIL_H
+#define DATABASE_UTIL_H
+
+struct hs_database;
+
+bool saveDatabase(const hs_database *db, const char *filename,
+                  bool verbose = false);
+
+hs_database *loadDatabase(const char *filename, bool verbose = false);
+
+#endif /* DATABASE_UTIL_H */
diff --git a/util/expression_path.h b/util/expression_path.h
new file mode 100644
index 00000000..3075b4d4
--- /dev/null
+++ b/util/expression_path.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EXPRESSION_PATH_H
+#define EXPRESSION_PATH_H
+
+#include "ue2common.h"
+
+#include <cerrno>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <sys/stat.h>
+#if !defined(_WIN32)
+#include <libgen.h>
+#include <unistd.h>
+#endif
+
+//
+// Utility functions
+//
+
+/**
+ * Given a path to a signature file, infer the path of the pcre directory.
+ */
+static inline
+std::string inferExpressionPath(const std::string &sigFile) {
+#ifndef _WIN32
+    // POSIX variant.
+
+    // dirname() may modify its argument, so we must make a copy.
+    std::vector<char> path(sigFile.size() + 1);
+    memcpy(path.data(), sigFile.c_str(), sigFile.size());
+    path[sigFile.size()] = 0; // ensure null termination.
+
+    std::string rv = dirname(path.data());
+#else
+    // Windows variant.
+    if (sigFile.size() >= _MAX_DIR) {
+        return std::string();
+    }
+    char path[_MAX_DIR];
+    _splitpath(sigFile.c_str(), nullptr, path, nullptr, nullptr);
+    std::string rv(path);
+#endif
+
+    rv += "/../pcre";
+    return rv;
+}
+
+#if defined(_WIN32)
+#define stat _stat
+#define S_IFREG _S_IFREG
+#endif
+
+static inline
+bool isDir(const std::string &filename) {
+    struct stat s;
+
+    if (stat(filename.c_str(), &s) == -1) {
+        std::cerr << "stat: " << strerror(errno) << std::endl;
+        return false;
+    }
+
+    return (S_IFDIR & s.st_mode);
+}
+
+static inline
+bool isFile(const std::string &filename) {
+    struct stat s;
+
+    if (stat(filename.c_str(), &s) == -1) {
+        std::cerr << "stat: " << strerror(errno) << std::endl;
+        return false;
+    }
+
+    return (S_IFREG & s.st_mode);
+}
+
+#endif /* EXPRESSION_PATH_H */
diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp
index 9fa6743e..ca7c413a 100644
--- a/util/ng_corpus_generator.cpp
+++ b/util/ng_corpus_generator.cpp
@@ -144,7 +144,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps,
     ue2::unordered_set<NFAVertex> one_way_in;
     for (const auto &v : vertices_range(g)) {
-        if (!hasGreaterInDegree(1, v, g)) {
+        if (in_degree(v, g) <= 1) {
             one_way_in.insert(v);
         }
     }
@@ -155,7 +155,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps,
         ptr_vector<VertexPath>::auto_type p = open.pop_back();
         NFAVertex u = p->back();
 
-        DEBUG_PRINTF("dequeuing path %s, back %u\n",
+        DEBUG_PRINTF("dequeuing path %s, back %zu\n",
                      pathToString(g, *p).c_str(), g[u].index);
 
         NGHolder::adjacency_iterator ai, ae;
@@ -187,7 +187,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps,
                 // Note that vertices that only have one predecessor don't need
                 // their cycle limit checked, as their predecessors will have
                 // the same count.
-                DEBUG_PRINTF("exceeded cycle limit for v=%u, pruning path\n",
+                DEBUG_PRINTF("exceeded cycle limit for v=%zu, pruning path\n",
                              g[v].index);
                 continue;
             }
@@ -301,7 +301,7 @@ void CorpusGeneratorImpl::addRandom(const min_max &mm, string *out) {
 }
 
 unsigned char CorpusGeneratorImpl::getChar(NFAVertex v) {
-    const CharReach &cr = graph.g[v].char_reach;
+    const CharReach &cr = graph[v].char_reach;
 
     switch (cProps.throwDice()) {
     case CorpusProperties::ROLLED_MATCH:
@@ -521,7 +521,7 @@ CorpusGeneratorUtf8::pathToCorpus(const vector &path) {
 }
 
 static
-u32 classify_vertex(const NFAGraph &g, NFAVertex v) {
+u32 classify_vertex(const NGHolder &g, NFAVertex v) {
     const CharReach &cr = g[v].char_reach;
     if (cr.isSubsetOf(UTF_ASCII_CR)) {
         return 1;
@@ -560,7 +560,7 @@ void expandCodePointSet(const CharReach &cr, CodePointSet *out, u32 mask,
 }
 
 static
-void decodePath(const NFAGraph &g, const VertexPath &in,
+void decodePath(const NGHolder &g, const VertexPath &in,
                 vector &out) {
     VertexPath::const_iterator it = in.begin();
     while (it != in.end()) {
@@ -618,7 +618,7 @@ void translatePaths(const NGHolder &graph,
     assert(out);
     for (const auto &path : allPathsTemp) {
         out->push_back(vector());
-        decodePath(graph.g, path, out->back());
+        decodePath(graph, path, out->back());
     }
 }
 
diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp
index 60ff0a17..2b337365 100644
--- a/util/ng_find_matches.cpp
+++ b/util/ng_find_matches.cpp
@@ -34,7 +34,7 @@
 
 #include "ng_find_matches.h"
 
-#include "nfagraph/ng_graph.h"
+#include "nfagraph/ng_holder.h"
 #include "nfagraph/ng_util.h"
 #include "parser/position.h"
 #include "util/container.h"
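
Usage notes (sketches for the reader, not part of the patch above):

The new saveDatabase()/loadDatabase() helpers added in util/database_util.cpp give
the tools a simple serialize-to-disk round trip. A minimal sketch of how a tool
might drive them; the pattern, flags, and output path are illustrative
assumptions, and the include paths assume a tool built inside the Hyperscan tree:

    #include "hs.h"
    #include "database_util.h"

    #include <iostream>

    int main(void) {
        hs_database_t *db = nullptr;
        hs_compile_error_t *compile_err = nullptr;

        // Compile a single (illustrative) pattern in block mode.
        if (hs_compile("foo.*bar", HS_FLAG_DOTALL, HS_MODE_BLOCK, nullptr,
                       &db, &compile_err) != HS_SUCCESS) {
            std::cerr << "compile failed: " << compile_err->message << std::endl;
            hs_free_compile_error(compile_err);
            return 1;
        }

        // Serialize to disk, then reload; loadDatabase() reads the file via
        // mmap on platforms where HAVE_MMAP is defined.
        if (!saveDatabase(db, "patterns.db", true)) {
            std::cerr << "save failed" << std::endl;
        }
        hs_free_database(db);

        hs_database_t *loaded = loadDatabase("patterns.db", true);
        if (loaded) {
            hs_free_database(loaded);
        }
        return 0;
    }

The unit/internal/shuffle.cpp changes above rename packedExtract32/packedExtract64
to pext32/pext64, matching the parallel-bits-extract operation the tests exercise.
For readers unfamiliar with PEXT, a scalar reference of the semantics those
assertions assume (the library's own helper may use the BMI2 instruction where
available; ref_pext32 is a hypothetical name used only here):

    // Gather the bits of 'x' selected by 'mask' and pack them contiguously
    // into the low end of the result, preserving their order.
    static unsigned ref_pext32(unsigned x, unsigned mask) {
        unsigned out = 0;
        unsigned bit = 0; // next output bit position
        for (unsigned i = 0; i < 32; i++) {
            if (mask & (1U << i)) {
                if (x & (1U << i)) {
                    out |= 1U << bit;
                }
                bit++;
            }
        }
        return out; // e.g. ref_pext32(0x28, 0x38) == 0x5
    }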