diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19a37b8e..7dc0fd79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,36 @@
 
 This is a list of notable changes to Hyperscan, in reverse chronological order.
 
+## [4.3.0] 2016-08-24
+- Introduce a new analysis pass ("Violet") used for decomposition of patterns
+  into literals and smaller engines.
+- Introduce a new container engine ("Tamarama") for infix and suffix engines
+  that can be proven to run exclusively of one another. This reduces stream
+  state for pattern sets with many such engines.
+- Introduce a new shuffle-based DFA engine ("Sheng"). This improves scanning
+  performance for pattern sets where small engines are generated.
+- Improve the analysis used to extract extra mask information from short
+  literals.
+- Reduced compile time spent in equivalence class analysis.
+- Build: frame pointers are now only omitted for 32-bit release builds.
+- Build: Workaround for C++ issues reported on FreeBSD/libc++ platforms.
+  (github issue #27)
+- Simplify the LimEx NFA with a unified "variable shift" model, which reduces
+  the number of different NFA code paths to one per model size.
+- Allow some anchored prefixes that may squash the literal to which they are
+  attached to run eagerly. This improves scanning performance for some
+  patterns.
+- Simplify and improve EOD ("end of data") matching, using the interpreter for
+  all operations.
+- Elide unnecessary instructions in the Rose interpreter at compile time.
+- Reduce the number of inlined instantiations of the Rose interpreter in order
+  to reduce instruction cache pressure.
+- Small improvements to literal matcher acceleration.
+- Parser: ignore `\E` metacharacters that are not preceded by `\Q`. This
+  conforms to PCRE's behaviour, rather than returning a compile error.
+- Check for misaligned memory when allocating an error structure in Hyperscan's
+  compile path and return an appropriate error if detected.
+
 ## [4.2.0] 2016-05-31
 - Introduce an interpreter for many complex actions to replace the use of
   internal reports within the core of Hyperscan (the "Rose" engine).
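The last new 4.3.0 entry above (the misaligned-allocation check) corresponds to the `hs_check_alloc()` guard added to `src/compiler/error.cpp` further down in this patch. A minimal sketch of the situation it guards against, assuming the public `hs_set_allocator()`/`hs_compile()` API; the deliberately misaligned allocator below is illustrative only and not part of the patch:

```c
#include <stdio.h>
#include <stdlib.h>
#include <hs.h>

/* Illustrative only: a custom allocator that returns misaligned pointers. */
static void *bad_alloc(size_t len) {
    char *p = malloc(len + 1);
    return p ? p + 1 : NULL;   /* deliberately break alignment */
}

static void bad_free(void *p) {
    if (p) {
        free((char *)p - 1);
    }
}

int main(void) {
    hs_set_allocator(bad_alloc, bad_free);

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    /* With this release the compile path can detect the misaligned
     * allocation and report it through the compile error, rather than
     * proceeding with a bad pointer. */
    if (hs_compile("foo.*bar", 0, HS_MODE_BLOCK, NULL, &db, &err)
            != HS_SUCCESS) {
        if (err && err->message) {
            printf("compile failed: %s\n", err->message);
        }
        hs_free_compile_error(err);
        return 1;
    }
    hs_free_database(db);
    return 0;
}
```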
This diff --git a/CMakeLists.txt b/CMakeLists.txt index c824b6a6..abbfe53b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,18 @@ cmake_minimum_required (VERSION 2.8.11) + +# don't use the built-in default configs +set (CMAKE_NOT_USING_CONFIG_FLAGS TRUE) + project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 2) +set (HS_MINOR_VERSION 3) set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) -string (TIMESTAMP BUILD_DATE "%Y-%m-%d") +# since we are doing this manually, we only have three types +set (CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo" + CACHE STRING "" FORCE) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) @@ -24,7 +30,7 @@ find_package(PkgConfig QUIET) if (NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type 'Release with debug info'") - set(CMAKE_BUILD_TYPE "RELWITHDEBINFO") + set(CMAKE_BUILD_TYPE RELWITHDEBINFO CACHE STRING "" FORCE ) else() string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) message(STATUS "Build type ${CMAKE_BUILD_TYPE}") @@ -90,6 +96,18 @@ else() message(FATAL_ERROR "No python interpreter found") endif() +# allow for reproducible builds - python for portability +if (DEFINED ENV{SOURCE_DATE_EPOCH}) + execute_process( + COMMAND "${PYTHON}" "${CMAKE_MODULE_PATH}/formatdate.py" "$ENV{SOURCE_DATE_EPOCH}" + OUTPUT_VARIABLE BUILD_DATE + OUTPUT_STRIP_TRAILING_WHITESPACE) +else () + string (TIMESTAMP BUILD_DATE "%Y-%m-%d") +endif () +message(STATUS "Build date: ${BUILD_DATE}") + + if(${RAGEL} STREQUAL "RAGEL-NOTFOUND") message(FATAL_ERROR "Ragel state machine compiler not found") endif() @@ -121,13 +139,7 @@ endif() CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) -option(DISABLE_ASSERTS "Disable assert(); enabled in debug builds, disabled in release builds" FALSE) - -if (DISABLE_ASSERTS) - if (CMAKE_BUILD_TYPE STREQUAL "DEBUG") - add_definitions(-DNDEBUG) - endif() -endif() +CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF) @@ -139,18 +151,26 @@ if(MSVC OR MSVC_IDE) if (MSVC_VERSION LESS 1700) message(FATAL_ERROR "The project requires C++11 features.") else() + # set base flags + set(CMAKE_C_FLAGS "/DWIN32 /D_WINDOWS /W3") + set(CMAKE_C_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") + + set(CMAKE_CXX_FLAGS "/DWIN32 /D_WINDOWS /W3 /GR /EHsc") + set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") + if (WINDOWS_ICC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /O3 /wd4267 /Qdiag-disable:remark") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /O2 /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") else() #TODO: don't hardcode arch - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /O2 /wd4267") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /O2 /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /wd4267") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") endif() - string(REGEX REPLACE "/RTC1" "" - CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" ) - string(REGEX REPLACE "/RTC1" "" - CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" ) + endif() @@ -172,16 +192,34 @@ else() unset(_GXX_OUTPUT) endif() - # set compiler flags - more are tested and added later - set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual") - set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor") - if (NOT RELEASE_BUILD) - # -Werror is most useful during development, don't potentially break - # release builds - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") + if(OPTIMISE) + set(OPT_C_FLAG "-O3") + set(OPT_CXX_FLAG "-O2") + else() + set(OPT_C_FLAG "-O0") + set(OPT_CXX_FLAG "-O0") + endif(OPTIMISE) + + # set up base flags for build types + set(CMAKE_C_FLAGS_DEBUG "-g ${OPT_C_FLAG} -Werror") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "-g ${OPT_C_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${OPT_C_FLAG}") + + set(CMAKE_CXX_FLAGS_DEBUG "-g ${OPT_CXX_FLAG} -Werror") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g ${OPT_CXX_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${OPT_CXX_FLAG}") + + if (DISABLE_ASSERTS) + # usually true for release builds, false for debug + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DNDEBUG") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG") endif() + + # set compiler flags - more are tested and added later + set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") + set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") + if (NOT CMAKE_C_FLAGS MATCHES .*march.*) message(STATUS "Building for current host CPU") set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -march=native -mtune=native") @@ -199,15 +237,7 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0 -Wno-unused-local-typedefs -Wno-maybe-uninitialized") endif() - if(OPTIMISE) - set(EXTRA_C_FLAGS "-O3 ${EXTRA_C_FLAGS}") - set(EXTRA_CXX_FLAGS "-O2 ${EXTRA_CXX_FLAGS}") - else() - set(EXTRA_C_FLAGS "-O0 ${EXTRA_C_FLAGS}") - set(EXTRA_CXX_FLAGS "-O0 ${EXTRA_CXX_FLAGS}") - endif(OPTIMISE) - - if(NOT RELEASE_BUILD) + if (NOT(ARCH_IA32 AND RELEASE_BUILD)) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() @@ -297,6 +327,11 @@ if (CXX_UNUSED_CONST_VAR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") endif() +# gcc 6 complains about type attributes that get ignored, like alignment +CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR) +if (CXX_IGNORED_ATTR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") +endif() # note this for later # g++ doesn't have this flag but clang does @@ -438,15 +473,14 @@ set (hs_exec_SRCS src/nfa/limex_simd128.c src/nfa/limex_simd256.c src/nfa/limex_simd384.c - src/nfa/limex_simd512a.c - src/nfa/limex_simd512b.c - src/nfa/limex_simd512c.c + src/nfa/limex_simd512.c src/nfa/limex.h src/nfa/limex_common_impl.h src/nfa/limex_context.h src/nfa/limex_internal.h 
src/nfa/limex_runtime.h src/nfa/limex_runtime_impl.h + src/nfa/limex_shuffle.h src/nfa/limex_state_impl.h src/nfa/mpv.h src/nfa/mpv.c @@ -477,9 +511,18 @@ set (hs_exec_SRCS src/nfa/repeat.c src/nfa/repeat.h src/nfa/repeat_internal.h + src/nfa/sheng.c + src/nfa/sheng.h + src/nfa/sheng_defs.h + src/nfa/sheng_impl.h + src/nfa/sheng_impl4.h + src/nfa/sheng_internal.h src/nfa/shufti_common.h src/nfa/shufti.c src/nfa/shufti.h + src/nfa/tamarama.c + src/nfa/tamarama.h + src/nfa/tamarama_internal.h src/nfa/truffle_common.h src/nfa/truffle.c src/nfa/truffle.h @@ -495,7 +538,6 @@ set (hs_exec_SRCS src/rose/block.c src/rose/catchup.h src/rose/catchup.c - src/rose/eod.c src/rose/infix.h src/rose/init.h src/rose/init.c @@ -503,6 +545,7 @@ set (hs_exec_SRCS src/rose/match.h src/rose/match.c src/rose/miracle.h + src/rose/program_runtime.c src/rose/program_runtime.h src/rose/runtime.h src/rose/rose.h @@ -510,6 +553,7 @@ set (hs_exec_SRCS src/rose/rose_program.h src/rose/rose_types.h src/rose/rose_common.h + src/rose/validate_mask.h src/util/bitutils.h src/util/exhaust.h src/util/fatbit.h @@ -524,11 +568,8 @@ set (hs_exec_SRCS src/util/pqueue.h src/util/scatter.h src/util/scatter_runtime.h - src/util/shuffle.h - src/util/shuffle_ssse3.h src/util/simd_utils.h - src/util/simd_utils_ssse3.h - src/util/simd_utils_ssse3.c + src/util/simd_utils.c src/util/state_compress.h src/util/state_compress.c src/util/unaligned.h @@ -597,11 +638,15 @@ SET (hs_SRCS src/hwlm/noodle_build.h src/hwlm/noodle_internal.h src/nfa/accel.h + src/nfa/accel_dfa_build_strat.cpp + src/nfa/accel_dfa_build_strat.h src/nfa/accelcompile.cpp src/nfa/accelcompile.h src/nfa/callback.h src/nfa/castlecompile.cpp src/nfa/castlecompile.h + src/nfa/dfa_build_strat.cpp + src/nfa/dfa_build_strat.h src/nfa/dfa_min.cpp src/nfa/dfa_min.h src/nfa/goughcompile.cpp @@ -613,8 +658,6 @@ SET (hs_SRCS src/nfa/mcclellan_internal.h src/nfa/mcclellancompile.cpp src/nfa/mcclellancompile.h - src/nfa/mcclellancompile_accel.cpp - src/nfa/mcclellancompile_accel.h src/nfa/mcclellancompile_util.cpp src/nfa/mcclellancompile_util.h src/nfa/limex_compile.cpp @@ -639,8 +682,13 @@ SET (hs_SRCS src/nfa/repeat_internal.h src/nfa/repeatcompile.cpp src/nfa/repeatcompile.h + src/nfa/sheng_internal.h + src/nfa/shengcompile.cpp + src/nfa/shengcompile.h src/nfa/shufticompile.cpp src/nfa/shufticompile.h + src/nfa/tamaramacompile.cpp + src/nfa/tamaramacompile.h src/nfa/trufflecompile.cpp src/nfa/trufflecompile.h src/nfagraph/ng.cpp @@ -746,6 +794,8 @@ SET (hs_SRCS src/nfagraph/ng_util.h src/nfagraph/ng_vacuous.cpp src/nfagraph/ng_vacuous.h + src/nfagraph/ng_violet.cpp + src/nfagraph/ng_violet.h src/nfagraph/ng_width.cpp src/nfagraph/ng_width.h src/parser/AsciiComponentClass.cpp @@ -825,6 +875,10 @@ SET (hs_SRCS src/rose/rose_build_compile.cpp src/rose/rose_build_convert.cpp src/rose/rose_build_convert.h + src/rose/rose_build_exclusive.cpp + src/rose/rose_build_exclusive.h + src/rose/rose_build_groups.cpp + src/rose/rose_build_groups.h src/rose/rose_build_impl.h src/rose/rose_build_infix.cpp src/rose/rose_build_infix.h @@ -853,6 +907,8 @@ SET (hs_SRCS src/util/charreach.cpp src/util/charreach.h src/util/charreach_util.h + src/util/clique.cpp + src/util/clique.h src/util/compare.h src/util/compile_context.cpp src/util/compile_context.h @@ -878,7 +934,6 @@ SET (hs_SRCS src/util/report_manager.cpp src/util/report_manager.h src/util/simd_utils.h - src/util/simd_utils_ssse3.h src/util/target_info.cpp src/util/target_info.h src/util/ue2_containers.h @@ -916,6 +971,10 @@ set(hs_dump_SRCS 
src/nfa/nfa_dump_dispatch.cpp src/nfa/nfa_dump_internal.cpp src/nfa/nfa_dump_internal.h + src/nfa/shengdump.cpp + src/nfa/shengdump.h + src/nfa/tamarama_dump.cpp + src/nfa/tamarama_dump.h src/parser/dump.cpp src/parser/dump.h src/parser/position_dump.h @@ -941,7 +1000,7 @@ endif() # choose which ones to build set (LIB_VERSION ${HS_VERSION}) -set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}) +set (LIB_SOVERSION ${HS_MAJOR_VERSION}) add_library(hs_exec OBJECT ${hs_exec_SRCS}) diff --git a/cmake/formatdate.py b/cmake/formatdate.py new file mode 100755 index 00000000..1b9c62d2 --- /dev/null +++ b/cmake/formatdate.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +from __future__ import print_function +import os +import sys +import datetime + +def usage(): + print("Usage:", os.path.basename(sys.argv[0]), "") + +if len(sys.argv) != 2: + usage() + sys.exit(1) + +ts = sys.argv[1] + +build_date = datetime.datetime.utcfromtimestamp(int(ts)) + +print(build_date.strftime("%Y-%m-%d")) diff --git a/examples/simplegrep.c b/examples/simplegrep.c index 9e392a8f..d6bd4b39 100644 --- a/examples/simplegrep.c +++ b/examples/simplegrep.c @@ -77,7 +77,7 @@ static int eventHandler(unsigned int id, unsigned long long from, * length with its length. Returns NULL on failure. */ static char *readInputData(const char *inputFN, unsigned int *length) { - FILE *f = fopen(inputFN, "r"); + FILE *f = fopen(inputFN, "rb"); if (!f) { fprintf(stderr, "ERROR: unable to open file \"%s\": %s\n", inputFN, strerror(errno)); diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index ce5f8723..d56aff88 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,7 +52,6 @@ #include "parser/shortcut_literal.h" #include "parser/unsupported.h" #include "parser/utf8_validate.h" -#include "smallwrite/smallwrite_build.h" #include "rose/rose_build.h" #include "rose/rose_build_dump.h" #include "som/slot_manager_dump.h" @@ -304,15 +303,6 @@ aligned_unique_ptr generateRoseEngine(NG &ng) { return nullptr; } - /* avoid building a smwr if just a pure floating case. 
*/ - if (!roseIsPureLiteral(rose.get())) { - u32 qual = roseQuality(rose.get()); - auto smwr = ng.smwr->build(qual); - if (smwr) { - rose = roseAddSmallWrite(rose.get(), smwr.get()); - } - } - dumpRose(*ng.rose, rose.get(), ng.cc.grey); dumpReportManager(ng.rm, ng.cc.grey); dumpSomSlotManager(ng.ssm, ng.cc.grey); diff --git a/src/compiler/error.cpp b/src/compiler/error.cpp index e806b7a0..07db9819 100644 --- a/src/compiler/error.cpp +++ b/src/compiler/error.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,7 @@ using std::string; static const char failureNoMemory[] = "Unable to allocate memory."; static const char failureInternal[] = "Internal error."; +static const char failureBadAlloc[] = "Allocator returned misaligned memory."; extern const hs_compile_error_t hs_enomem = { const_cast(failureNoMemory), 0 @@ -49,6 +50,9 @@ extern const hs_compile_error_t hs_enomem = { extern const hs_compile_error_t hs_einternal = { const_cast(failureInternal), 0 }; +extern const hs_compile_error_t hs_badalloc = { + const_cast(failureBadAlloc), 0 +}; namespace ue2 { @@ -56,8 +60,18 @@ hs_compile_error_t *generateCompileError(const string &err, int expression) { hs_compile_error_t *ret = (struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t)); if (ret) { + hs_error_t e = hs_check_alloc(ret); + if (e != HS_SUCCESS) { + hs_misc_free(ret); + return const_cast(&hs_badalloc); + } char *msg = (char *)hs_misc_alloc(err.size() + 1); if (msg) { + e = hs_check_alloc(msg); + if (e != HS_SUCCESS) { + hs_misc_free(msg); + return const_cast(&hs_badalloc); + } memcpy(msg, err.c_str(), err.size() + 1); ret->message = msg; } else { @@ -83,7 +97,8 @@ void freeCompileError(hs_compile_error_t *error) { if (!error) { return; } - if (error == &hs_enomem || error == &hs_einternal) { + if (error == &hs_enomem || error == &hs_einternal || + error == &hs_badalloc) { // These are not allocated. 
return; } diff --git a/src/database.c b/src/database.c index 635a3b66..a4e10c22 100644 --- a/src/database.c +++ b/src/database.c @@ -458,33 +458,16 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length, } *info = NULL; - if (!bytes || length < sizeof(struct hs_database)) { - return HS_INVALID; + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; } - const u32 *buf = (const u32 *)bytes; + u32 mode = unaligned_load_u32(bytes + offsetof(struct RoseEngine, mode)); - u32 magic = unaligned_load_u32(buf++); - if (magic != HS_DB_MAGIC) { - return HS_INVALID; - } - - u32 version = unaligned_load_u32(buf++); - - buf++; /* length */ - - platform_t plat; - plat = unaligned_load_u64a(buf); - buf += 2; - - buf++; /* crc */ - buf++; /* reserved 0 */ - buf++; /* reserved 1 */ - - const char *t_raw = (const char *)buf; - u32 mode = unaligned_load_u32(t_raw + offsetof(struct RoseEngine, mode)); - - return print_database_string(info, version, plat, mode); + return print_database_string(info, header.version, header.platform, mode); } HS_PUBLIC_API diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index ff69853e..4230c2b1 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -36,7 +36,6 @@ #include "teddy.h" #include "teddy_internal.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /** \brief number of bytes processed in each iteration */ #define ITER_BYTES 16 @@ -132,7 +131,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft, u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); tmp &= fdr->domainMask; s = *((const m128 *)ft + tmp); - s = shiftRight8Bits(s); + s = rshiftbyte_m128(s, 1); } else { s = fdr->start; } @@ -186,20 +185,20 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, m128 st14 = *(const m128 *)(ft + v14*8); m128 st15 = *(const m128 *)(ft + v15*8); - st1 = byteShiftLeft128(st1, 1); - st2 = byteShiftLeft128(st2, 2); - st3 = byteShiftLeft128(st3, 3); - st4 = byteShiftLeft128(st4, 4); - st5 = byteShiftLeft128(st5, 5); - st6 = byteShiftLeft128(st6, 6); - st7 = byteShiftLeft128(st7, 7); - st9 = byteShiftLeft128(st9, 1); - st10 = byteShiftLeft128(st10, 2); - st11 = byteShiftLeft128(st11, 3); - st12 = byteShiftLeft128(st12, 4); - st13 = byteShiftLeft128(st13, 5); - st14 = byteShiftLeft128(st14, 6); - st15 = byteShiftLeft128(st15, 7); + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); *s = or128(*s, st0); *s = or128(*s, st1); @@ -210,7 +209,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st6); *s = or128(*s, st7); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); @@ -222,7 +221,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st14); *s = or128(*s, st15); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } @@ -253,19 +252,19 @@ void get_conf_stride_2(const u8 *itPtr, const 
u8 *start_ptr, const u8 *end_ptr, m128 st12 = *(const m128 *)(ft + v12*8); m128 st14 = *(const m128 *)(ft + v14*8); - st2 = byteShiftLeft128(st2, 2); - st4 = byteShiftLeft128(st4, 4); - st6 = byteShiftLeft128(st6, 6); - st10 = byteShiftLeft128(st10, 2); - st12 = byteShiftLeft128(st12, 4); - st14 = byteShiftLeft128(st14, 6); + st2 = lshiftbyte_m128(st2, 2); + st4 = lshiftbyte_m128(st4, 4); + st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); *s = or128(*s, st0); *s = or128(*s, st2); *s = or128(*s, st4); *s = or128(*s, st6); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); @@ -273,7 +272,7 @@ void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st12); *s = or128(*s, st14); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } @@ -296,27 +295,26 @@ void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, m128 st8 = *(const m128 *)(ft + v8*8); m128 st12 = *(const m128 *)(ft + v12*8); - st4 = byteShiftLeft128(st4, 4); - st12 = byteShiftLeft128(st12, 4); + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); *s = or128(*s, st0); *s = or128(*s, st4); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); *s = or128(*s, st12); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } static really_inline -void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal, +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, const u32 *confBase, const struct FDR_Runtime_Args *a, - const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id, - struct zone *z) { + const u8 *ptr, u32 *last_match_id, struct zone *z) { const u8 bucket = 8; const u8 pullback = 1; @@ -352,13 +350,13 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal, continue; } *last_match_id = id; - *controlVal = a->cb(ptr_main + byte - a->buf, - ptr_main + byte - a->buf, id, a->ctxt); + *control = a->cb(ptr_main + byte - a->buf, ptr_main + byte - a->buf, + id, a->ctxt); continue; } u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a)); - confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, - control, last_match_id, confVal); + confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, control, + last_match_id, confVal); } while (unlikely(!!*conf)); } @@ -681,9 +679,9 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, itPtr += ITER_BYTES) { \ if (unlikely(itPtr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ - &floodBackoff, &controlVal, \ + &floodBackoff, &control, \ ITER_BYTES); \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ return HWLM_TERMINATED; \ } \ } \ @@ -692,11 +690,11 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, u64a conf8; \ get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \ ft, &conf0, &conf8, &s); \ - do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \ - control, &last_match_id, zz); \ - do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \ - control, &last_match_id, zz); \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + 
do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ return HWLM_TERMINATED; \ } \ } /* end for loop */ \ @@ -704,9 +702,8 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; u64a domain_mask_adjusted = fdr->domainMask << 1; @@ -771,7 +768,10 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, #define ONLY_AVX2(func) NULL #endif -typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); +typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + static const FDRFUNCTYPE funcs[] = { fdr_engine_exec, ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), @@ -814,7 +814,6 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, start, cb, ctxt, - &groups, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), 0 }; @@ -822,7 +821,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, return HWLM_SUCCESS; } else { assert(funcs[fdr->engineID]); - return funcs[fdr->engineID](fdr, &a); + return funcs[fdr->engineID](fdr, &a, groups); } } @@ -840,7 +839,6 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, start, cb, ctxt, - &groups, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), /* we are guaranteed to always have 16 initialised bytes at the end of * the history buffer (they may be garbage). */ @@ -853,7 +851,7 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, ret = HWLM_SUCCESS; } else { assert(funcs[fdr->engineID]); - ret = funcs[fdr->engineID](fdr, &a); + ret = funcs[fdr->engineID](fdr, &a, groups); } fdrPackState(fdr, &a, stream_state); diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 0c4ef35d..89a0ff72 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -81,7 +81,7 @@ private: void dumpMasks(const u8 *defaultMask); #endif void setupTab(); - aligned_unique_ptr setupFDR(pair link); + aligned_unique_ptr setupFDR(pair, size_t> &link); void createInitialState(FDR *fdr); public: @@ -90,7 +90,7 @@ public: : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in), make_small(make_small_in) {} - aligned_unique_ptr build(pair link); + aligned_unique_ptr build(pair, size_t> &link); }; u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) { @@ -124,10 +124,8 @@ void FDRCompiler::createInitialState(FDR *fdr) { // Find the minimum length for the literals in this bucket. 
const vector &bucket_lits = bucketToLits[b]; u32 min_len = ~0U; - for (vector::const_iterator it = bucket_lits.begin(), - ite = bucket_lits.end(); - it != ite; ++it) { - min_len = min(min_len, verify_u32(lits[*it].s.length())); + for (const LiteralIndex &lit_idx : bucket_lits) { + min_len = min(min_len, verify_u32(lits[lit_idx].s.length())); } DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len); @@ -141,13 +139,12 @@ void FDRCompiler::createInitialState(FDR *fdr) { } } -aligned_unique_ptr FDRCompiler::setupFDR(pair link) { +aligned_unique_ptr +FDRCompiler::setupFDR(pair, size_t> &link) { size_t tabSize = eng.getTabSizeBytes(); - pair floodControlTmp = setupFDRFloodControl(lits, eng); - - pair confirmTmp = - setupFullMultiConfs(lits, eng, bucketToLits, make_small); + auto floodControlTmp = setupFDRFloodControl(lits, eng); + auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); assert(ISALIGNED_16(tabSize)); assert(ISALIGNED_16(confirmTmp.second)); @@ -175,14 +172,12 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { copy(tab.begin(), tab.end(), ptr); ptr += tabSize; - memcpy(ptr, confirmTmp.first, confirmTmp.second); + memcpy(ptr, confirmTmp.first.get(), confirmTmp.second); ptr += confirmTmp.second; - aligned_free(confirmTmp.first); fdr->floodOffset = verify_u32(ptr - fdr_base); - memcpy(ptr, floodControlTmp.first, floodControlTmp.second); + memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); ptr += floodControlTmp.second; - aligned_free(floodControlTmp.first); /* we are allowing domains 9 to 15 only */ assert(eng.bits > 8 && eng.bits < 16); @@ -193,8 +188,7 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { if (link.first) { fdr->link = verify_u32(ptr - fdr_base); - memcpy(ptr, link.first, link.second); - aligned_free(link.first); + memcpy(ptr, link.first.get(), link.second); } else { fdr->link = 0; } @@ -217,13 +211,11 @@ struct LitOrder { if (len1 != len2) { return len1 < len2; } else { - string::const_reverse_iterator it1, it2; - tie(it1, it2) = - std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin()); - if (it1 == i1s.rend()) { + auto p = std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin()); + if (p.first == i1s.rend()) { return false; } - return *it1 < *it2; + return *p.first < *p.second; } } @@ -266,9 +258,8 @@ void FDRCompiler::assignStringsToBuckets() { stable_sort(vli.begin(), vli.end(), LitOrder(lits)); #ifdef DEBUG_ASSIGNMENT - for (map::iterator i = lenCounts.begin(), e = lenCounts.end(); - i != e; ++i) { - printf("l<%d>:%d ", i->first, i->second); + for (const auto &m : lenCounts) { + printf("l<%u>:%u ", m.first, m.second); } printf("\n"); #endif @@ -324,12 +315,12 @@ void FDRCompiler::assignStringsToBuckets() { for (u32 k = j; k < nChunks; ++k) { cnt += count[k]; } - t[j][0] = make_pair(getScoreUtil(length[j], cnt), 0); + t[j][0] = {getScoreUtil(length[j], cnt), 0}; } for (u32 i = 1; i < nb; i++) { for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row - SCORE_INDEX_PAIR best = make_pair(MAX_SCORE, 0); + SCORE_INDEX_PAIR best = {MAX_SCORE, 0}; u32 cnt = count[j]; for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) { SCORE score = getScoreUtil(length[j], cnt); @@ -338,12 +329,12 @@ void FDRCompiler::assignStringsToBuckets() { } score += t[k][i-1].first; if (score < best.first) { - best = make_pair(score, k); + best = {score, k}; } } t[j][i] = best; } - t[nChunks - 1][i] = make_pair(0,0); // fill in empty final row for next iteration + t[nChunks - 1][i] = {0,0}; // fill in empty final row for next 
iteration } #ifdef DEBUG_ASSIGNMENT @@ -405,8 +396,7 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng, distance = 4; } - for (vector::const_iterator i = vl.begin(), e = vl.end(); - i != e; ++i) { + for (auto i = vl.begin(), e = vl.end(); i != e; ++i) { if (e - i > 5) { __builtin_prefetch(&lits[*(i + 5)]); } @@ -460,31 +450,25 @@ void FDRCompiler::setupTab() { memcpy(tabIndexToMask(i), &defaultMask[0], mask_size); } - typedef std::map > M2SET; - for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { const vector &vl = bucketToLits[b]; SuffixPositionInString pLimit = eng.getBucketWidth(b); for (SuffixPositionInString pos = 0; pos < pLimit; pos++) { u32 bit = eng.getSchemeBit(b, pos); - M2SET m2; + map> m2; bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2); if (done) { clearbit(&defaultMask[0], bit); continue; } - for (M2SET::const_iterator i = m2.begin(), e = m2.end(); i != e; - ++i) { - u32 dc = i->first; - const ue2::unordered_set &mskSet = i->second; + for (const auto &elem : m2) { + u32 dc = elem.first; + const ue2::unordered_set &mskSet = elem.second; u32 v = ~dc; do { u32 b2 = v & dc; - for (ue2::unordered_set::const_iterator - i2 = mskSet.begin(), - e2 = mskSet.end(); - i2 != e2; ++i2) { - u32 val = (*i2 & ~dc) | b2; + for (const u32 &mskVal : mskSet) { + u32 val = (mskVal & ~dc) | b2; clearbit(tabIndexToMask(val), bit); } v = (v + (dc & -dc)) | ~dc; @@ -502,7 +486,8 @@ void FDRCompiler::setupTab() { #endif } -aligned_unique_ptr FDRCompiler::build(pair link) { +aligned_unique_ptr +FDRCompiler::build(pair, size_t> &link) { assignStringsToBuckets(); setupTab(); return setupFDR(link); @@ -515,16 +500,15 @@ aligned_unique_ptr fdrBuildTableInternal(const vector &lits, bool make_small, const target_t &target, const Grey &grey, u32 hint, hwlmStreamingControl *stream_control) { - pair link(nullptr, 0); + pair, size_t> link(nullptr, 0); if (stream_control) { - link = fdrBuildTableStreaming(lits, stream_control); + link = fdrBuildTableStreaming(lits, *stream_control); } DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? 
"avx2" : "no-avx2"); if (grey.fdrAllowTeddy) { - aligned_unique_ptr fdr - = teddyBuildTableHinted(lits, make_small, hint, target, link); + auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, link); if (fdr) { DEBUG_PRINTF("build with teddy succeeded\n"); return fdr; diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index d98bb518..48e2ed6f 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,7 @@ #include "ue2common.h" #include "hwlm/hwlm_literal.h" +#include "util/alloc.h" #include #include @@ -44,7 +45,6 @@ namespace ue2 { // a pile of decorative typedefs // good for documentation purposes more than anything else typedef u32 LiteralIndex; -typedef u32 ConfirmIndex; typedef u32 SuffixPositionInString; // zero is last byte, counting back // into the string typedef u32 BucketIndex; @@ -56,25 +56,22 @@ class EngineDescription; class FDREngineDescription; struct hwlmStreamingControl; -size_t getFDRConfirm(const std::vector &lits, FDRConfirm **fdrc_p, - bool make_small); - -std::pair setupFullMultiConfs( +std::pair, size_t> setupFullMultiConfs( const std::vector &lits, const EngineDescription &eng, - std::map > &bucketToLits, + std::map> &bucketToLits, bool make_small); // all suffixes include an implicit max_bucket_width suffix to ensure that // we always read a full-scale flood "behind" us in terms of what's in our // state; if we don't have a flood that's long enough we won't be in the // right state yet to allow blindly advancing -std::pair +std::pair, size_t> setupFDRFloodControl(const std::vector &lits, const EngineDescription &eng); -std::pair +std::pair, size_t> fdrBuildTableStreaming(const std::vector &lits, - hwlmStreamingControl *stream_control); + hwlmStreamingControl &stream_control); static constexpr u32 HINT_INVALID = 0xffffffff; diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 08946a5f..23437fe2 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -45,9 +45,10 @@ using namespace std; namespace ue2 { -typedef u8 ConfSplitType; -typedef pair BucketSplitPair; -typedef map > BC2CONF; +using ConfSplitType = u8; +using BucketSplitPair = pair; +using BC2CONF = map, size_t>>; // return the number of bytes beyond a length threshold in all strings in lits static @@ -149,9 +150,9 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, //#define FDR_CONFIRM_DUMP 1 -static -size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, - bool applyOneCharOpt, bool make_small, bool make_confirm) { +static pair, size_t> +getFDRConfirm(const vector &lits, bool applyOneCharOpt, + bool make_small, bool make_confirm) { vector tmpLitInfo(lits.size()); CONF_TYPE andmsk; fillLitInfo(lits, tmpLitInfo, andmsk); @@ -220,55 +221,61 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, #ifdef FDR_CONFIRM_DUMP // print out the literals reversed - makes it easier to line up analyses // that are end-offset based - for (map >::iterator i = res2lits.begin(), - e = res2lits.end(); i != e; ++i) { - u32 hash = i->first; - vector & vlidx = i->second; - if (vlidx.size() > 1) { - printf("%x -> %zu literals\n", hash, vlidx.size()); - u32 min_len = lits[vlidx.front()].s.size(); - vector > vsl; // 
contains the set of chars at each location - // reversed from the end - vsl.resize(1024); - u32 total_string_size = 0; - for (vector::iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; - total_string_size += lits[litIdx].s.size(); - for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) { - vsl[lits[litIdx].s.size()-j].insert(lits[litIdx].s.c_str()[j - 1]); - } - min_len = MIN(min_len, lits[litIdx].s.size()); + for (const auto &m : res2lits) { + const u32 &hash = m.first; + const vector &vlidx = m.second; + if (vlidx.size() <= 1) { + continue; + } + printf("%x -> %zu literals\n", hash, vlidx.size()); + size_t min_len = lits[vlidx.front()].s.size(); + + vector> vsl; // contains the set of chars at each location + // reversed from the end + + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + if (lit.s.size() > vsl.size()) { + vsl.resize(lit.s.size()); } - printf("common "); - for (u32 j = 0; j < min_len; j++) { - if (vsl[j].size() == 1) { - printf("%02x", (u32)*vsl[j].begin()); - } else { + for (size_t j = lit.s.size(); j != 0; j--) { + vsl[lit.s.size() - j].insert(lit.s[j - 1]); + } + min_len = min(min_len, lit.s.size()); + } + printf("common "); + for (size_t j = 0; j < min_len; j++) { + if (vsl[j].size() == 1) { + printf("%02x", *vsl[j].begin()); + } else { + printf("__"); + } + } + printf("\n"); + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + printf("%8x %c", lit.id, lit.nocase ? '!' : ' '); + for (size_t j = lit.s.size(); j != 0; j--) { + size_t dist_from_end = lit.s.size() - j; + if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) { printf("__"); + } else { + printf("%02x", lit.s[j - 1]); } } printf("\n"); - for (vector::iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; - printf("%8x %c", lits[litIdx].id, lits[litIdx].nocase ? '!' : ' '); - for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) { - u32 dist_from_end = lits[litIdx].s.size() - j; - if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) { - printf("__"); - } else { - printf("%02x", (u32)lits[litIdx].s.c_str()[j-1]); - } - } - printf("\n"); - } - u32 total_compares = 0; - for (u32 j = 0; j < 1024; j++) { // naughty - total_compares += vsl[j].size(); - } - printf("Total compare load: %d Total string size: %d\n\n", total_compares, total_string_size); } + size_t total_compares = 0; + for (const auto &v : vsl) { + total_compares += v.size(); + } + size_t total_string_size = 0; + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + total_string_size += lit.s.size(); + } + printf("Total compare load: %zu Total string size: %zu\n\n", + total_compares, total_string_size); } #endif @@ -281,7 +288,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, sizeof(LitInfo) * lits.size() + totalLitSize; size = ROUNDUP_N(size, alignof(FDRConfirm)); - FDRConfirm *fdrc = (FDRConfirm *)aligned_zmalloc(size); + auto fdrc = aligned_zmalloc_unique(size); assert(fdrc); // otherwise would have thrown std::bad_alloc fdrc->andmsk = andmsk; @@ -295,7 +302,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, fdrc->groups = gm; // After the FDRConfirm, we have the lit index array. 
- u8 *fdrc_base = (u8 *)fdrc; + u8 *fdrc_base = (u8 *)fdrc.get(); u8 *ptr = fdrc_base + sizeof(*fdrc); ptr = ROUNDUP_PTR(ptr, alignof(u32)); u32 *bitsToLitIndex = (u32 *)ptr; @@ -307,14 +314,12 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, // Walk the map by hash value assigning indexes and laying out the // elements (and their associated string confirm material) in memory. - for (std::map >::const_iterator - i = res2lits.begin(), e = res2lits.end(); i != e; ++i) { - const u32 hash = i->first; - const vector &vlidx = i->second; - bitsToLitIndex[hash] = verify_u32(ptr - (u8 *)fdrc); - for (vector::const_iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; + for (const auto &m : res2lits) { + const u32 hash = m.first; + const vector &vlidx = m.second; + bitsToLitIndex[hash] = verify_u32(ptr - fdrc_base); + for (auto i = vlidx.begin(), e = vlidx.end(); i != e; ++i) { + LiteralIndex litIdx = *i; // Write LitInfo header. u8 *oldPtr = ptr; @@ -333,7 +338,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, } ptr = ROUNDUP_PTR(ptr, alignof(LitInfo)); - if (i2 + 1 == e2) { + if (next(i) == e) { finalLI.next = 0x0; } else { // our next field represents an adjustment on top of @@ -348,14 +353,13 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, assert((size_t)(ptr - fdrc_base) <= size); } - *fdrc_p = fdrc; - // Return actual used size, not worst-case size. Must be rounded up to // FDRConfirm alignment so that the caller can lay out a sequence of these. size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base), alignof(FDRConfirm)); assert(actual_size <= size); - return actual_size; + + return {move(fdrc), actual_size}; } static @@ -377,12 +381,9 @@ u32 setupMultiConfirms(const vector &lits, u32 totalConfirmSize = 0; for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { if (!bucketToLits[b].empty()) { - vector > vl(eng.getConfirmTopLevelSplit()); - for (vector::const_iterator - i = bucketToLits[b].begin(), - e = bucketToLits[b].end(); - i != e; ++i) { - hwlmLiteral lit = lits[*i]; // copy + vector> vl(eng.getConfirmTopLevelSplit()); + for (const LiteralIndex &lit_idx : bucketToLits[b]) { + hwlmLiteral lit = lits[lit_idx]; // copy // c is last char of this literal u8 c = *(lit.s.rbegin()); @@ -424,26 +425,27 @@ u32 setupMultiConfirms(const vector &lits, } for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) { - if (!vl[c].empty()) { - DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size()); - FDRConfirm *fdrc; - size_t size = getFDRConfirm(vl[c], &fdrc, - eng.typicallyHoldsOneCharLits(), - make_small, makeConfirm); - BucketSplitPair p = make_pair(b, c); - bc2Conf[p] = make_pair(fdrc, size); - totalConfirmSize += size; + if (vl[c].empty()) { + continue; } + DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size()); + auto key = make_pair(b, c); + auto fc = getFDRConfirm(vl[c], eng.typicallyHoldsOneCharLits(), + make_small, makeConfirm); + totalConfirmSize += fc.second; + assert(bc2Conf.find(key) == end(bc2Conf)); + bc2Conf.emplace(key, move(fc)); } } } return totalConfirmSize; } -pair setupFullMultiConfs(const vector &lits, - const EngineDescription &eng, - map > &bucketToLits, - bool make_small) { +pair, size_t> +setupFullMultiConfs(const vector &lits, + const EngineDescription &eng, + map> &bucketToLits, + bool make_small) { BC2CONF bc2Conf; u32 totalConfirmSize = setupMultiConfirms(lits, eng, bc2Conf, bucketToLits, make_small); @@ -453,26 +455,24 @@ pair setupFullMultiConfs(const vector &lits, u32 
totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32); u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize); - u8 *buf = (u8 *)aligned_zmalloc(totalSize); + auto buf = aligned_zmalloc_unique(totalSize); assert(buf); // otherwise would have thrown std::bad_alloc - u32 *confBase = (u32 *)buf; - u8 *ptr = buf + totalConfSwitchSize; + u32 *confBase = (u32 *)buf.get(); + u8 *ptr = buf.get() + totalConfSwitchSize; - for (BC2CONF::const_iterator i = bc2Conf.begin(), e = bc2Conf.end(); i != e; - ++i) { - const pair &p = i->second; + for (const auto &m : bc2Conf) { + const BucketIndex &b = m.first.first; + const u8 &c = m.first.second; + const pair, size_t> &p = m.second; // confirm offset is relative to the base of this structure, now - u32 confirm_offset = verify_u32(ptr - (u8 *)buf); - memcpy(ptr, p.first, p.second); + u32 confirm_offset = verify_u32(ptr - buf.get()); + memcpy(ptr, p.first.get(), p.second); ptr += p.second; - aligned_free(p.first); - BucketIndex b = i->first.first; - u8 c = i->first.second; u32 idx = c * nBuckets + b; confBase[idx] = confirm_offset; } - return make_pair(buf, totalSize); + return {move(buf), totalSize}; } } // namespace ue2 diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index cde13f6c..6272b69e 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -105,7 +105,6 @@ struct FDR_Runtime_Args { size_t start_offset; HWLMCallback cb; void *ctxt; - hwlm_group_t *groups; const u8 *firstFloodDetect; const u64a histBytes; }; diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index 34536eec..b2e1656c 100644 --- a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -94,14 +94,13 @@ static bool setupLongLits(const vector &lits, vector &long_lits, size_t max_len) { long_lits.reserve(lits.size()); - for (vector::const_iterator it = lits.begin(); - it != lits.end(); ++it) { - if (it->s.length() > max_len) { - hwlmLiteral tmp = *it; // copy - tmp.s.erase(tmp.s.size() - 1, 1); // erase last char + for (const auto &lit : lits) { + if (lit.s.length() > max_len) { + hwlmLiteral tmp = lit; // copy + tmp.s.pop_back(); tmp.id = 0; // recalc later tmp.groups = 0; // filled in later by hash bucket(s) - long_lits.push_back(tmp); + long_lits.push_back(move(tmp)); } } @@ -112,15 +111,12 @@ bool setupLongLits(const vector &lits, // sort long_literals by caseful/caseless and in lexicographical order, // remove duplicates stable_sort(long_lits.begin(), long_lits.end(), LongLitOrder()); - vector::iterator new_end = - unique(long_lits.begin(), long_lits.end(), hwlmLitEqual); + auto new_end = unique(long_lits.begin(), long_lits.end(), hwlmLitEqual); long_lits.erase(new_end, long_lits.end()); // fill in ids; not currently used - for (vector::iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - i->id = i - long_lits.begin(); + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { + i->id = distance(long_lits.begin(), i); } return true; } @@ -143,23 +139,19 @@ void analyzeLits(const vector &long_lits, size_t max_len, hashedPositions[m] = 0; } - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { if (i->nocase) { - boundaries[CASEFUL] = verify_u32(i - long_lits.begin()); + boundaries[CASEFUL] = verify_u32(distance(long_lits.begin(), i)); break; } } - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - MODES m = 
i->nocase ? CASELESS : CASEFUL; - for (u32 j = 1; j < i->s.size() - max_len + 1; j++) { + for (const auto &lit : long_lits) { + Modes m = lit.nocase ? CASELESS : CASEFUL; + for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { hashedPositions[m]++; } - positions[m] += i->s.size(); + positions[m] += lit.s.size(); } for (u32 m = CASEFUL; m < MAX_MODES; m++) { @@ -170,7 +162,7 @@ void analyzeLits(const vector &long_lits, size_t max_len, #ifdef DEBUG_COMPILE printf("analyzeLits:\n"); - for (MODES m = CASEFUL; m < MAX_MODES; m++) { + for (Modes m = CASEFUL; m < MAX_MODES; m++) { printf("mode %s boundary %d positions %d hashedPositions %d " "hashEntries %d\n", (m == CASEFUL) ? "caseful" : "caseless", boundaries[m], @@ -181,7 +173,7 @@ void analyzeLits(const vector &long_lits, size_t max_len, } static -u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, MODES m) { +u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, Modes m) { return streaming_hash((const u8 *)l.s.c_str() + offset, max_len, m); } @@ -203,24 +195,21 @@ struct OffsetIDFromEndOrder { static void fillHashes(const vector &long_lits, size_t max_len, - FDRSHashEntry *tab, size_t numEntries, MODES m, + FDRSHashEntry *tab, size_t numEntries, Modes mode, map &litToOffsetVal) { const u32 nbits = lg2(numEntries); map > > bucketToLitOffPairs; map bucketToBitfield; - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - const hwlmLiteral &l = *i; - if ((m == CASELESS) != i->nocase) { + for (const auto &lit : long_lits) { + if ((mode == CASELESS) != lit.nocase) { continue; } - for (u32 j = 1; j < i->s.size() - max_len + 1; j++) { - u32 h = hashLit(l, j, max_len, m); + for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { + u32 h = hashLit(lit, j, max_len, mode); u32 h_ent = h & ((1U << nbits) - 1); u32 h_low = (h >> nbits) & 63; - bucketToLitOffPairs[h_ent].push_back(make_pair(i->id, j)); + bucketToLitOffPairs[h_ent].emplace_back(lit.id, j); bucketToBitfield[h_ent] |= (1ULL << h_low); } } @@ -231,11 +220,9 @@ void fillHashes(const vector &long_lits, size_t max_len, // sweep out bitfield entries and save the results swapped accordingly // also, anything with bitfield entries is put in filledBuckets - for (map::const_iterator i = bucketToBitfield.begin(), - e = bucketToBitfield.end(); - i != e; ++i) { - u32 bucket = i->first; - u64a contents = i->second; + for (const auto &m : bucketToBitfield) { + const u32 &bucket = m.first; + const u64a &contents = m.second; tab[bucket].bitfield = contents; filledBuckets.set(bucket); } @@ -243,12 +230,9 @@ void fillHashes(const vector &long_lits, size_t max_len, // store out all our chains based on free values in our hash table. // find nearest free locations that are empty (there will always be more // entries than strings, at present) - for (map > >::iterator - i = bucketToLitOffPairs.begin(), - e = bucketToLitOffPairs.end(); - i != e; ++i) { - u32 bucket = i->first; - deque > &d = i->second; + for (auto &m : bucketToLitOffPairs) { + u32 bucket = m.first; + deque> &d = m.second; // sort d by distance of the residual string (len minus our depth into // the string). We need to put the 'furthest back' string first... 
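As a side note on the layout being built by `fillHashes()` above: each hashed literal position maps to a table slot (low `nbits` of the hash) plus one bit of that slot's 64-bit bitfield (the next 6 bits), which appears to let the runtime reject most probes cheaply via the `has_bit()` helper seen later in this patch. A rough standalone sketch of that split; names other than `h_ent`/`h_low` are illustrative:

```c
#include <stdint.h>
#include <stdio.h>

/* Rough sketch of the hash split used by fillHashes(): the low nbits of
 * the hash select a table slot, and the next 6 bits set one bit of that
 * slot's 64-bit bitfield. */
static void add_entry(uint64_t *bitfields, uint32_t h, uint32_t nbits) {
    uint32_t h_ent = h & ((1u << nbits) - 1);  /* table slot */
    uint32_t h_low = (h >> nbits) & 63;        /* bit within the bitfield */
    bitfields[h_ent] |= 1ull << h_low;
}

int main(void) {
    uint64_t bitfields[1u << 10] = {0};
    add_entry(bitfields, 0xdeadbeefu, 10);     /* slot 0x2ef, bit 0x2f */
    printf("%016llx\n", (unsigned long long)bitfields[0x2ef]);
    return 0;
}
```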
@@ -299,31 +283,30 @@ void fillHashes(const vector &long_lits, size_t max_len, static size_t maxMaskLen(const vector &lits) { size_t rv = 0; - vector::const_iterator it, ite; - for (it = lits.begin(), ite = lits.end(); it != ite; ++it) { - rv = max(rv, it->msk.size()); + for (const auto &lit : lits) { + rv = max(rv, lit.msk.size()); } return rv; } -pair +pair, size_t> fdrBuildTableStreaming(const vector &lits, - hwlmStreamingControl *stream_control) { + hwlmStreamingControl &stream_control) { // refuse to compile if we are forced to have smaller than minimum // history required for long-literal support, full stop // otherwise, choose the maximum of the preferred history quantity // (currently a fairly extravagant 32) or the already used history - // quantity - subject to the limitation of stream_control->history_max + // quantity - subject to the limitation of stream_control.history_max const size_t MIN_HISTORY_REQUIRED = 32; - if (MIN_HISTORY_REQUIRED > stream_control->history_max) { + if (MIN_HISTORY_REQUIRED > stream_control.history_max) { throw std::logic_error("Cannot set history to minimum history required"); } size_t max_len = - MIN(stream_control->history_max, - MAX(MIN_HISTORY_REQUIRED, stream_control->history_min)); + MIN(stream_control.history_max, + MAX(MIN_HISTORY_REQUIRED, stream_control.history_min)); assert(max_len >= MIN_HISTORY_REQUIRED); size_t max_mask_len = maxMaskLen(lits); @@ -334,10 +317,10 @@ fdrBuildTableStreaming(const vector &lits, // we want enough history to manage the longest literal and the longest // mask. - stream_control->literal_history_required = + stream_control.literal_history_required = max(maxLen(lits), max_mask_len) - 1; - stream_control->literal_stream_state_required = 0; - return make_pair(nullptr, size_t{0}); + stream_control.literal_stream_state_required = 0; + return {nullptr, size_t{0}}; } // Ensure that we have enough room for the longest mask. 
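To make the history clamp just above concrete: the streaming table wants at least `MIN_HISTORY_REQUIRED` (32) bytes of history, uses a larger `history_min` if the caller already requires one, and is capped by `history_max` (compilation having been refused earlier if even 32 bytes are unavailable). A small standalone sketch of that computation; the function name is illustrative:

```c
#include <stdio.h>

#define MIN_HISTORY_REQUIRED 32

/* Sketch of: max_len = MIN(history_max, MAX(MIN_HISTORY_REQUIRED, history_min)) */
static unsigned clamp_history(unsigned history_min, unsigned history_max) {
    unsigned want = history_min > MIN_HISTORY_REQUIRED ? history_min
                                                       : MIN_HISTORY_REQUIRED;
    return want < history_max ? want : history_max;
}

int main(void) {
    /* e.g. (min=16, max=60) -> 32, (min=40, max=60) -> 40, (min=80, max=60) -> 60 */
    printf("%u %u %u\n", clamp_history(16, 60), clamp_history(40, 60),
           clamp_history(80, 60));
    return 0;
}
```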
@@ -381,11 +364,11 @@ fdrBuildTableStreaming(const vector &lits, streamBits[CASELESS] = lg2(roundUpToPowerOfTwo(positions[CASELESS] + 2)); u32 tot_state_bytes = (streamBits[CASEFUL] + streamBits[CASELESS] + 7) / 8; - u8 * secondaryTable = (u8 *)aligned_zmalloc(tabSize); + auto secondaryTable = aligned_zmalloc_unique(tabSize); assert(secondaryTable); // otherwise would have thrown std::bad_alloc // then fill it in - u8 * ptr = secondaryTable; + u8 * ptr = secondaryTable.get(); FDRSTableHeader * header = (FDRSTableHeader *)ptr; // fill in header header->pseudoEngineID = (u32)0xffffffff; @@ -407,11 +390,9 @@ fdrBuildTableStreaming(const vector &lits, ptr += litTabSize; map litToOffsetVal; - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { u32 entry = verify_u32(i - long_lits.begin()); - u32 offset = verify_u32(ptr - secondaryTable); + u32 offset = verify_u32(ptr - secondaryTable.get()); // point the table entry to the string location litTabPtr[entry].offset = offset; @@ -425,20 +406,20 @@ fdrBuildTableStreaming(const vector &lits, } // fill in final lit table entry with current ptr (serves as end value) - litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable); + litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable.get()); // fill hash tables - ptr = secondaryTable + htOffset[CASEFUL]; + ptr = secondaryTable.get() + htOffset[CASEFUL]; for (u32 m = CASEFUL; m < MAX_MODES; ++m) { fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m], - (MODES)m, litToOffsetVal); + (Modes)m, litToOffsetVal); ptr += htSize[m]; } // tell the world what we did - stream_control->literal_history_required = max_len; - stream_control->literal_stream_state_required = tot_state_bytes; - return make_pair(secondaryTable, tabSize); + stream_control.literal_history_required = max_len; + stream_control.literal_stream_state_required = tot_state_bytes; + return {move(secondaryTable), tabSize}; } } // namespace ue2 diff --git a/src/fdr/fdr_streaming_internal.h b/src/fdr/fdr_streaming_internal.h index 26602ce1..11b07b56 100644 --- a/src/fdr/fdr_streaming_internal.h +++ b/src/fdr/fdr_streaming_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,11 +41,11 @@ // hash table (caseful) (FDRSHashEntry) // hash table (caseless) (FDRSHashEntry) -typedef enum { +enum Modes { CASEFUL = 0, CASELESS = 1, MAX_MODES = 2 -} MODES; +}; // We have one of these structures hanging off the 'link' of our secondary // FDR table that handles streaming strings @@ -91,12 +91,12 @@ struct FDRSHashEntry { }; static really_inline -u32 get_start_lit_idx(const struct FDRSTableHeader * h, MODES m) { +u32 get_start_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { return m == CASEFUL ? 
0 : h->boundary[m-1]; } static really_inline -u32 get_end_lit_idx(const struct FDRSTableHeader * h, MODES m) { +u32 get_end_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { return h->boundary[m]; } @@ -107,17 +107,17 @@ const struct FDRSLiteral * getLitTab(const struct FDRSTableHeader * h) { } static really_inline -u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, MODES m) { +u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, enum Modes m) { return getLitTab(h)[get_start_lit_idx(h, m)].offset; } static really_inline -u32 packStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) { +u32 packStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { return v - getBaseOffsetOfLits(h, m) + 1; } static really_inline -u32 unpackStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) { +u32 unpackStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { return v + getBaseOffsetOfLits(h, m) - 1; } @@ -127,7 +127,7 @@ u32 has_bit(const struct FDRSHashEntry * ent, u32 bit) { } static really_inline -u32 streaming_hash(const u8 *ptr, UNUSED size_t len, MODES mode) { +u32 streaming_hash(const u8 *ptr, UNUSED size_t len, enum Modes mode) { const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL; const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL; assert(len >= 32); diff --git a/src/fdr/fdr_streaming_runtime.h b/src/fdr/fdr_streaming_runtime.h index fa5843c5..8e264c76 100644 --- a/src/fdr/fdr_streaming_runtime.h +++ b/src/fdr/fdr_streaming_runtime.h @@ -143,7 +143,7 @@ u32 fdrStreamStateActive(const struct FDR * fdr, const u8 * stream_state) { // binary search for the literal index that contains the current state static really_inline u32 findLitTabEntry(const struct FDRSTableHeader * streamingTable, - u32 stateValue, MODES m) { + u32 stateValue, enum Modes m) { const struct FDRSLiteral * litTab = getLitTab(streamingTable); u32 lo = get_start_lit_idx(streamingTable, m); u32 hi = get_end_lit_idx(streamingTable, m); @@ -175,7 +175,7 @@ void fdrUnpackStateMode(struct FDR_Runtime_Args *a, const struct FDRSTableHeader *streamingTable, const struct FDRSLiteral * litTab, const u32 *state_table, - const MODES m) { + const enum Modes m) { if (!state_table[m]) { return; } @@ -213,8 +213,9 @@ void fdrUnpackState(const struct FDR * fdr, struct FDR_Runtime_Args * a, } static really_inline -u32 do_single_confirm(const struct FDRSTableHeader * streamingTable, - const struct FDR_Runtime_Args * a, u32 hashState, MODES m) { +u32 do_single_confirm(const struct FDRSTableHeader *streamingTable, + const struct FDR_Runtime_Args *a, u32 hashState, + enum Modes m) { const struct FDRSLiteral * litTab = getLitTab(streamingTable); u32 idx = findLitTabEntry(streamingTable, hashState, m); size_t found_offset = litTab[idx].offset; @@ -279,7 +280,7 @@ void fdrFindStreamingHash(const struct FDR_Runtime_Args *a, static really_inline const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable, - u32 h, const MODES m) { + u32 h, const enum Modes m) { u32 nbits = streamingTable->hashNBits[m]; if (!nbits) { return NULL; @@ -303,7 +304,7 @@ const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable, static really_inline void fdrPackStateMode(u32 *state_table, const struct FDR_Runtime_Args *a, const struct FDRSTableHeader *streamingTable, - const struct FDRSHashEntry *ent, const MODES m) { + const struct FDRSHashEntry *ent, const enum Modes m) { assert(ent); assert(streamingTable->hashNBits[m]); diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp index 
2c131788..62693c30 100644 --- a/src/fdr/flood_compile.cpp +++ b/src/fdr/flood_compile.cpp @@ -69,7 +69,7 @@ static void updateFloodSuffix(vector &tmpFlood, u8 c, u32 suffix) { FDRFlood &fl = tmpFlood[c]; fl.suffix = MAX(fl.suffix, suffix + 1); - DEBUG_PRINTF("Updated Flood Suffix for char '%c' to %u\n", c, fl.suffix); + DEBUG_PRINTF("Updated Flood Suffix for char 0x%02x to %u\n", c, fl.suffix); } static @@ -90,8 +90,9 @@ void addFlood(vector &tmpFlood, u8 c, const hwlmLiteral &lit, } } -pair setupFDRFloodControl(const vector &lits, - const EngineDescription &eng) { +pair, size_t> +setupFDRFloodControl(const vector &lits, + const EngineDescription &eng) { vector tmpFlood(N_CHARS); u32 default_suffix = eng.getDefaultFloodSuffixLength(); @@ -124,8 +125,9 @@ pair setupFDRFloodControl(const vector &lits, for (u32 i = 0; i < iEnd; i++) { if (i < litSize) { if (isDifferent(c, lit.s[litSize - i - 1], lit.nocase)) { - DEBUG_PRINTF("non-flood char in literal[%u] %c != %c\n", - i, c, lit.s[litSize - i - 1]); + DEBUG_PRINTF("non-flood char in literal[%u]: " + "0x%02x != 0x%02x\n", + i, c, lit.s[litSize - i - 1]); upSuffix = MIN(upSuffix, i); loSuffix = MIN(loSuffix, i); // makes sense only for case-less break; @@ -195,11 +197,12 @@ pair setupFDRFloodControl(const vector &lits, size_t floodHeaderSize = sizeof(u32) * N_CHARS; size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods; size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize); - u8 *buf = (u8 *)aligned_zmalloc(totalSize); + + auto buf = aligned_zmalloc_unique(totalSize); assert(buf); // otherwise would have thrown std::bad_alloc - u32 *floodHeader = (u32 *)buf; - FDRFlood *layoutFlood = (FDRFlood * )(buf + floodHeaderSize); + u32 *floodHeader = (u32 *)buf.get(); + FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize); u32 currentFloodIndex = 0; for (const auto &m : flood2chars) { @@ -215,7 +218,7 @@ pair setupFDRFloodControl(const vector &lits, DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n", floodHeaderSize, floodStructSize, totalSize); - return make_pair((u8 *)buf, totalSize); + return {move(buf), totalSize}; } } // namespace ue2 diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 08b761c0..e7a0fccd 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -36,7 +36,6 @@ #include "teddy_internal.h" #include "teddy_runtime_common.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -80,15 +79,15 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { do { \ if (unlikely(isnonzero128(var))) { \ u64a lo = movq(var); \ - u64a hi = movq(byteShiftRight128(var, 8)); \ + u64a hi = movq(rshiftbyte_m128(var, 8)); \ if (unlikely(lo)) { \ conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(hi)) { \ conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -98,27 +97,27 @@ do { \ do { \ if (unlikely(isnonzero128(var))) { \ u32 part1 = movd(var); \ - u32 part2 = movd(byteShiftRight128(var, 4)); \ - u32 part3 = movd(byteShiftRight128(var, 8)); \ - u32 part4 = movd(byteShiftRight128(var, 12)); \ + u32 part2 = movd(rshiftbyte_m128(var, 4)); \ + u32 part3 = movd(rshiftbyte_m128(var, 8)); \ + u32 part4 = movd(rshiftbyte_m128(var, 12)); \ if (unlikely(part1)) { \ 
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -126,36 +125,34 @@ do { \ #endif static really_inline -m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) { +m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); - return and128(and128(pshufb(maskBase[0*2], lo), - pshufb(maskBase[0*2+1], hi)), p_mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + return and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi)); } static really_inline -m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask, - m128 val) { +m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); - m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m1(maskBase, val); m128 res_1 = and128(pshufb(maskBase[1*2], lo), pshufb(maskBase[1*2+1], hi)); m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); *old_1 = res_1; - return and128(and128(r, p_mask), res_shifted_1); + return and128(r, res_shifted_1); } static really_inline m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, - m128 p_mask, m128 val) { + m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); - m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m2(maskBase, old_1, val); m128 res_2 = and128(pshufb(maskBase[2*2], lo), pshufb(maskBase[2*2+1], hi)); @@ -166,11 +163,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, static really_inline m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, - m128 *old_3, m128 p_mask, m128 val) { + m128 *old_3, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); - m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); m128 res_3 = and128(pshufb(maskBase[3*2], lo), pshufb(maskBase[3*2+1], hi)); @@ -180,11 +177,10 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, } hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -203,13 +199,14 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, 
m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } @@ -217,9 +214,9 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); } @@ -227,19 +224,19 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -258,13 +255,14 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -272,9 +270,9 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -282,19 +280,19 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, 
do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -314,14 +312,14 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -329,11 +327,9 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -341,19 +337,19 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -373,14 +369,14 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -388,11 +384,9 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + 
(iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -400,19 +394,19 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -434,14 +428,15 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -450,10 +445,10 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr + 16)); + load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -461,20 +456,19 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -496,14 +490,15 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m128 r_0 = 
prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -512,10 +507,10 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr + 16)); + load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -523,20 +518,19 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -559,14 +553,15 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -575,10 +570,10 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr + 16)); + &res_old_3, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -587,19 +582,19 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + 
hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -622,14 +617,15 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -638,10 +634,10 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr + 16)); + &res_old_3, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -650,9 +646,10 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index f3902723..e2936723 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -33,64 +33,85 @@ #ifndef TEDDY_H_ #define TEDDY_H_ +#include "hwlm/hwlm.h" // for hwlm_group_t + struct FDR; // forward declaration from fdr_internal.h struct FDR_Runtime_Args; hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); #if defined(__AVX2__) hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const 
struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); +hwlm_error_t +fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); #endif /* __AVX2__ */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 33dd8a30..e4a836d4 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -36,7 +36,6 @@ #include "teddy_internal.h" #include "teddy_runtime_common.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #if defined(__AVX2__) @@ -122,22 +121,22 @@ do { \ u64a part4 = extract64from256(r, 1); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -159,41 +158,41 @@ do { \ u32 part8 = extract32from256(r, 3); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if 
(unlikely(part5)) { \ conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part6)) { \ conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part7)) { \ conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part8)) { \ conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -205,11 +204,11 @@ do { \ if (unlikely(isnonzero256(var))) { \ u32 arrCnt = 0; \ m128 lo = cast256to128(var); \ - m128 hi = cast256to128(swap128in256(var)); \ + m128 hi = movdq_hi(var); \ bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ for (u32 i = 0; i < arrCnt; i++) { \ - conf_fn(bitArr[i], confBase, reason, a, ptr, control, \ + conf_fn(bitArr[i], confBase, reason, a, ptr, &control, \ &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ @@ -372,7 +371,7 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { 64 * (offset); *arrCnt += 1; } - u64a part_1 = movq(byteShiftRight128(var, 8)); + u64a part_1 = movq(rshiftbyte_m128(var, 8)); while (unlikely(part_1)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + 64 * (offset + 1); @@ -385,19 +384,19 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { 32 * (offset * 2); *arrCnt += 1; } - u32 part_1 = movd(byteShiftRight128(var, 4)); + u32 part_1 = movd(rshiftbyte_m128(var, 4)); while (unlikely(part_1)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + 32 * (offset * 2 + 1); *arrCnt += 1; } - u32 part_2 = movd(byteShiftRight128(var, 8)); + u32 part_2 = movd(rshiftbyte_m128(var, 8)); while (unlikely(part_2)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) + 32 * (offset * 2 + 2); *arrCnt += 1; } - u32 part_3 = movd(byteShiftRight128(var, 12)); + u32 part_3 = movd(rshiftbyte_m128(var, 12)); while (unlikely(part_3)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) + 32 * (offset * 2 + 3); @@ -408,36 +407,35 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { } static really_inline -m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) { +m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); - return and256(and256(vpshufb(maskBase[0*2], lo), - vpshufb(maskBase[0*2+1], hi)), p_mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + return and256(vpshufb(maskBase[0*2], lo), + vpshufb(maskBase[0*2+1], hi)); } static really_inline -m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask, - m256 val) { +m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); - m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m1(maskBase, val); m256 res_1 = and256(vpshufb(maskBase[1*2], lo), vpshufb(maskBase[1*2+1], hi)); m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); *old_1 = res_1; - return and256(and256(r, p_mask), 
res_shifted_1); + return and256(r, res_shifted_1); } static really_inline m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, - m256 p_mask, m256 val) { + m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); - m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); m256 res_2 = and256(vpshufb(maskBase[2*2], lo), vpshufb(maskBase[2*2+1], hi)); @@ -448,11 +446,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, static really_inline m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, - m256 *old_3, m256 p_mask, m256 val) { + m256 *old_3, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); - m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); m256 res_3 = and256(vpshufb(maskBase[3*2], lo), vpshufb(maskBase[3*2+1], hi)); @@ -462,12 +460,10 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, } static really_inline -m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi, - m256 p_mask) { +m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) { m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); - m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); - return and256(res, p_mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); } static really_inline @@ -482,11 +478,10 @@ const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) { } hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -505,13 +500,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } @@ -519,10 +515,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), - load2x128(ptr + 16)); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, 
do_confWithBit1_teddy); } @@ -530,19 +525,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -561,13 +556,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -575,10 +571,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), - load2x128(ptr + 16)); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -586,19 +581,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -618,14 +613,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, 
do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -633,10 +628,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -645,19 +639,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -677,25 +671,24 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; + ptr += 16; } for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -704,19 +697,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); 
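Across all of these Teddy kernels the partial-load mask p_mask is no longer threaded through prep_conf_*: previously even the steady-state loop passed ones128()/ones256() and paid an AND per block, whereas now only the cautious head/tail iterations AND p_mask in after the shuffle work. The sketch below shows the shape of that change with scalar stand-ins for the SIMD types; prep_block, load_block and load_block_cautious are illustrative names, not the real Teddy routines.

#include <cstddef>
#include <cstdint>
#include <cstring>

using block_t = uint64_t; // stand-in for an m128/m256 of per-byte hit bits

// Does only the shuffle/AND work; takes no partial-load mask any more.
block_t prep_block(block_t val) {
    return val & 0x8080808080808080ULL; // stand-in for the nibble-shuffle hit test
}

block_t load_block(const uint8_t *p) { // full, in-bounds load
    block_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

// Cautious load near the buffer end: reports which byte lanes are valid.
block_t load_block_cautious(block_t *p_mask, const uint8_t *p, const uint8_t *end) {
    block_t v = 0, m = 0;
    for (size_t i = 0; i < sizeof(v) && p + i < end; i++) {
        v |= static_cast<block_t>(p[i]) << (8 * i);
        m |= static_cast<block_t>(0xff) << (8 * i);
    }
    *p_mask = m;
    return v;
}

void scan(const uint8_t *buf, size_t len) {
    const uint8_t *ptr = buf, *end = buf + len;

    // Steady state: full loads, so no AND against an all-ones mask per block.
    for (; ptr + sizeof(block_t) <= end; ptr += sizeof(block_t)) {
        block_t r = prep_block(load_block(ptr));
        (void)r; // confirm(r) would run here
    }

    // Tail: only this cautious path pays for the extra AND with p_mask.
    if (ptr < end) {
        block_t p_mask;
        block_t r = prep_block(load_block_cautious(&p_mask, ptr, end));
        r &= p_mask;
        (void)r; // confirm(r) would run here
    }
}

int main() {
    const char text[] = "teddy-style scan over a short buffer";
    scan(reinterpret_cast<const uint8_t *>(text), sizeof(text) - 1);
    return 0;
}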
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -738,14 +731,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -754,10 +748,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr + 16)); + load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -766,19 +760,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -800,14 +794,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -816,10 +811,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m256 r_1 = 
prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr + 16)); + load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -828,19 +823,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -863,15 +858,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -880,12 +875,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr + 16)); + &res_old_3, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -894,19 +887,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -929,15 +922,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); 
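The other change repeated through every exec function here is how the HWLM group word travels: the local controlVal copied from *a->groups and written back on each exit is gone, the functions take an hwlm_group_t control argument by value, and they simply return HWLM_TERMINATED or HWLM_SUCCESS (CHECK_HWLM_TERMINATE_MATCHING now tests control directly, as the teddy_runtime_common.h hunk below shows). The sketch that follows is a compressed before/after of that control flow only; scan_old, scan_new, confirm_step and the TERMINATE sentinel are stand-ins, not the real FDR dispatch code.

#include <cstdint>

using hwlm_group_t = uint64_t;
enum hwlm_error_t { HWLM_SUCCESS, HWLM_TERMINATED };
const hwlm_group_t TERMINATE = 0; // stand-in for HWLM_TERMINATE_MATCHING

// Stand-in for the confirm path: reporting a match may squash group bits, and
// running out of groups is the signal to stop scanning.
void confirm_step(hwlm_group_t *control) {
    *control &= *control - 1; // drop one group bit per call (illustrative only)
}

struct runtime_args { hwlm_group_t *groups; };

// Before: the group word lives behind a->groups, is copied into a local and
// written back on every return path.
hwlm_error_t scan_old(runtime_args *a) {
    hwlm_group_t controlVal = *a->groups;
    for (int i = 0; i < 4; i++) {
        confirm_step(&controlVal);
        if (controlVal == TERMINATE) {
            *a->groups = controlVal;
            return HWLM_TERMINATED;
        }
    }
    *a->groups = controlVal;
    return HWLM_SUCCESS;
}

// After: the group word is passed by value and the outcome is simply returned;
// no write-back through the args structure on any path.
hwlm_error_t scan_new(hwlm_group_t control) {
    for (int i = 0; i < 4; i++) {
        confirm_step(&control);
        if (control == TERMINATE) {
            return HWLM_TERMINATED;
        }
    }
    return HWLM_SUCCESS;
}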
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -946,12 +939,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr + 16)); + &res_old_3, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -960,19 +951,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -996,16 +987,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); ptr += 32; } if (ptr + 32 < buf_end) { m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); ptr += 32; } @@ -1015,13 +1005,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, CHECK_FLOOD; m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, - ones256()); + m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); } @@ -1029,20 +1017,19 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = 
and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -1066,16 +1053,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); ptr += 32; } if (ptr + 32 < buf_end) { m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); ptr += 32; } @@ -1085,13 +1071,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, CHECK_FLOOD; m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy); m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, - ones256()); + m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy); } @@ -1099,11 +1083,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); } - *a->groups = controlVal; + return HWLM_SUCCESS; } diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index c1e46d85..15b9665b 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -74,12 +74,11 @@ public: const TeddyEngineDescription &eng_in, bool make_small_in) : eng(eng_in), lits(lits_in), make_small(make_small_in) {} - aligned_unique_ptr build(pair link); + aligned_unique_ptr build(pair, size_t> &link); bool pack(map > &bucketToLits); }; class TeddySet { - const vector &lits; u32 len; // nibbleSets is a series of bitfields over 16 predicates // that represent the whether shufti nibble set @@ -89,8 +88,7 @@ class TeddySet { vector nibbleSets; set litIds; public: - TeddySet(const vector &lits_in, u32 len_in) - : lits(lits_in), len(len_in), nibbleSets(len_in * 2, 0) {} + explicit TeddySet(u32 len_in) : len(len_in), nibbleSets(len_in * 2, 0) {} const set & getLits() const { return litIds; } size_t litCount() const { return litIds.size(); } @@ -106,8 +104,8 @@ public: } printf("\nnlits: %zu\nLit ids: ", litCount()); printf("Prob: %llu\n", probability()); - 
for (set::iterator i = litIds.begin(), e = litIds.end(); i != e; ++i) { - printf("%u ", *i); + for (const auto &id : litIds) { + printf("%u ", id); } printf("\n"); printf("Flood prone : %s\n", isRunProne()?"yes":"no"); @@ -118,15 +116,15 @@ public: return nibbleSets == ts.nibbleSets; } - void addLiteral(u32 lit_id) { - const string &s = lits[lit_id].s; + void addLiteral(u32 lit_id, const hwlmLiteral &lit) { + const string &s = lit.s; for (u32 i = 0; i < len; i++) { if (i < s.size()) { u8 c = s[s.size() - i - 1]; u8 c_hi = (c >> 4) & 0xf; u8 c_lo = c & 0xf; nibbleSets[i*2] = 1 << c_lo; - if (lits[lit_id].nocase && ourisalpha(c)) { + if (lit.nocase && ourisalpha(c)) { nibbleSets[i*2+1] = (1 << (c_hi&0xd)) | (1 << (c_hi|0x2)); } else { nibbleSets[i*2+1] = 1 << c_hi; @@ -185,28 +183,26 @@ bool TeddyCompiler::pack(map sts; for (u32 i = 0; i < lits.size(); i++) { - TeddySet ts(lits, eng.numMasks); - ts.addLiteral(i); + TeddySet ts(eng.numMasks); + ts.addLiteral(i, lits[i]); sts.insert(ts); } while (1) { #ifdef TEDDY_DEBUG printf("Size %zu\n", sts.size()); - for (set::const_iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { - printf("\n"); i1->dump(); + for (const TeddySet &ts : sts) { + printf("\n"); ts.dump(); } printf("\n===============================================\n"); #endif - set::iterator m1 = sts.end(), m2 = sts.end(); + auto m1 = sts.end(), m2 = sts.end(); u64a best = 0xffffffffffffffffULL; - for (set::iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { - set::iterator i2 = i1; - ++i2; + for (auto i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { const TeddySet &s1 = *i1; - for (set::iterator e2 = sts.end(); i2 != e2; ++i2) { + for (auto i2 = next(i1), e2 = sts.end(); i2 != e2; ++i2) { const TeddySet &s2 = *i2; // be more conservative if we don't absolutely need to @@ -216,7 +212,7 @@ bool TeddyCompiler::pack(map eng.getNumBuckets()) { return false; } - for (set::const_iterator i = sts.begin(), e = sts.end(); i != e; - ++i) { - for (set::const_iterator i2 = i->getLits().begin(), - e2 = i->getLits().end(); - i2 != e2; ++i2) { - bucketToLits[cnt].push_back(*i2); - } - cnt++; + u32 bucket_id = 0; + for (const TeddySet &ts : sts) { + const auto &ts_lits = ts.getLits(); + auto &bucket_lits = bucketToLits[bucket_id]; + bucket_lits.insert(end(bucket_lits), begin(ts_lits), end(ts_lits)); + bucket_id++; } return true; } -aligned_unique_ptr TeddyCompiler::build(pair link) { +aligned_unique_ptr +TeddyCompiler::build(pair, size_t> &link) { if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { DEBUG_PRINTF("too many literals: %zu\n", lits.size()); return nullptr; @@ -314,9 +308,8 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - pair floodControlTmp = setupFDRFloodControl(lits, eng); - pair confirmTmp - = setupFullMultiConfs(lits, eng, bucketToLits, make_small); + auto floodControlTmp = setupFDRFloodControl(lits, eng); + auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); size_t size = ROUNDUP_N(sizeof(Teddy) + maskLen + @@ -334,38 +327,29 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { teddy->maxStringLen = verify_u32(maxLen(lits)); u8 *ptr = teddy_base + sizeof(Teddy) + maskLen; - memcpy(ptr, confirmTmp.first, confirmTmp.second); + memcpy(ptr, confirmTmp.first.get(), confirmTmp.second); ptr += confirmTmp.second; - aligned_free(confirmTmp.first); teddy->floodOffset = verify_u32(ptr - teddy_base); - memcpy(ptr, floodControlTmp.first, floodControlTmp.second); + memcpy(ptr, 
floodControlTmp.first.get(), floodControlTmp.second); ptr += floodControlTmp.second; - aligned_free(floodControlTmp.first); if (link.first) { teddy->link = verify_u32(ptr - teddy_base); - memcpy(ptr, link.first, link.second); - aligned_free(link.first); + memcpy(ptr, link.first.get(), link.second); } else { teddy->link = 0; } u8 *baseMsk = teddy_base + sizeof(Teddy); - for (map >::const_iterator - i = bucketToLits.begin(), - e = bucketToLits.end(); - i != e; ++i) { - const u32 bucket_id = i->first; - const vector &ids = i->second; + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; const u8 bmsk = 1U << (bucket_id % 8); - for (vector::const_iterator i2 = ids.begin(), - e2 = ids.end(); - i2 != e2; ++i2) { - LiteralIndex lit_id = *i2; - const hwlmLiteral & l = lits[lit_id]; + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); const u32 sz = verify_u32(l.s.size()); @@ -439,10 +423,10 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { } // namespace -aligned_unique_ptr teddyBuildTableHinted(const vector &lits, - bool make_small, u32 hint, - const target_t &target, - pair link) { +aligned_unique_ptr +teddyBuildTableHinted(const vector &lits, bool make_small, + u32 hint, const target_t &target, + pair, size_t> &link) { unique_ptr des; if (hint == HINT_INVALID) { des = chooseTeddyEngine(target, lits); diff --git a/src/fdr/teddy_compile.h b/src/fdr/teddy_compile.h index fba6a3d1..276c1347 100644 --- a/src/fdr/teddy_compile.h +++ b/src/fdr/teddy_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,7 +49,7 @@ struct hwlmLiteral; ue2::aligned_unique_ptr teddyBuildTableHinted(const std::vector &lits, bool make_small, u32 hint, const target_t &target, - std::pair link); + std::pair, size_t> &link); } // namespace ue2 diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index c50b4d16..dc65c70a 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -51,8 +51,7 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; #define CHECK_HWLM_TERMINATE_MATCHING \ do { \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ - *a->groups = controlVal; \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ return HWLM_TERMINATED; \ } \ } while (0); @@ -61,8 +60,7 @@ do { \ do { \ if (unlikely(ptr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ - &floodBackoff, &controlVal, \ - iterBytes); \ + &floodBackoff, &control, iterBytes); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } while (0); diff --git a/src/grey.cpp b/src/grey.cpp index 69dab627..bad56b56 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,7 @@ #include #include -#define DEFAULT_MAX_HISTORY 60 +#define DEFAULT_MAX_HISTORY 110 using namespace std; @@ -50,8 +50,11 @@ Grey::Grey(void) : allowLitHaig(true), allowLbr(true), allowMcClellan(true), + allowSheng(true), allowPuff(true), + allowLiteral(true), allowRose(true), + 
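/*
 * Minimal sketch (assumed 64-byte alignment, C++17) of the ownership pattern
 * behind the hunk above: the explicit aligned_free() calls disappear because
 * the confirm/flood/link buffers now arrive as smart pointers, so only .get()
 * is needed for the memcpy and the memory is released automatically. The
 * library's actual aligned_unique_ptr/allocator are not reproduced here.
 */
#include <cstddef>
#include <memory>
#include <new>

struct AlignedDelete {
    void operator()(unsigned char *p) const noexcept {
        ::operator delete[](p, std::align_val_t{64});
    }
};
using aligned_bytes = std::unique_ptr<unsigned char[], AlignedDelete>;

static aligned_bytes make_aligned(std::size_t bytes) {
    // over-aligned array allocation; freed by AlignedDelete when the owner dies
    return aligned_bytes(new (std::align_val_t{64}) unsigned char[bytes]);
}

// usage shape: auto buf = make_aligned(sz); memcpy(dst, buf.get(), sz);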
allowViolet(true), allowExtendedNFA(true), /* bounded repeats of course */ allowLimExNFA(true), allowAnchoredAcyclic(true), @@ -60,6 +63,13 @@ Grey::Grey(void) : allowDecoratedLiteral(true), allowNoodle(true), fdrAllowTeddy(true), + violetAvoidSuffixes(true), + violetAvoidWeakInfixes(true), + violetDoubleCut(true), + violetExtractStrongLiterals(true), + violetLiteralChains(true), + violetDoubleCutLiteralLen(3), + violetEarlyCleanLiteralLen(6), puffImproveHead(true), castleExclusive(true), mergeSEP(true), /* short exhaustible passthroughs */ @@ -81,7 +91,6 @@ Grey::Grey(void) : allowZombies(true), floodAsPuffette(false), nfaForceSize(0), - nfaForceShifts(0), maxHistoryAvailable(DEFAULT_MAX_HISTORY), minHistoryAvailable(0), /* debugging only */ maxAnchoredRegion(63), /* for rose's atable to run over */ @@ -119,6 +128,7 @@ Grey::Grey(void) : equivalenceEnable(true), allowSmallWrite(true), // McClellan dfas for small patterns + allowSmallWriteSheng(false), // allow use of Sheng for SMWR smallWriteLargestBuffer(70), // largest buffer that can be // considered a small write @@ -126,6 +136,10 @@ Grey::Grey(void) : // are given to rose &co smallWriteLargestBufferBad(35), limitSmallWriteOutfixSize(1048576), // 1 MB + smallWriteMaxPatterns(10000), + smallWriteMaxLiterals(10000), + allowTamarama(true), // Tamarama engine + tamaChunkSize(100), dumpFlags(0), limitPatternCount(8000000), // 8M patterns limitPatternLength(16000), // 16K bytes @@ -202,8 +216,11 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowLitHaig); G_UPDATE(allowLbr); G_UPDATE(allowMcClellan); + G_UPDATE(allowSheng); G_UPDATE(allowPuff); + G_UPDATE(allowLiteral); G_UPDATE(allowRose); + G_UPDATE(allowViolet); G_UPDATE(allowExtendedNFA); G_UPDATE(allowLimExNFA); G_UPDATE(allowAnchoredAcyclic); @@ -212,6 +229,13 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowDecoratedLiteral); G_UPDATE(allowNoodle); G_UPDATE(fdrAllowTeddy); + G_UPDATE(violetAvoidSuffixes); + G_UPDATE(violetAvoidWeakInfixes); + G_UPDATE(violetDoubleCut); + G_UPDATE(violetExtractStrongLiterals); + G_UPDATE(violetLiteralChains); + G_UPDATE(violetDoubleCutLiteralLen); + G_UPDATE(violetEarlyCleanLiteralLen); G_UPDATE(puffImproveHead); G_UPDATE(castleExclusive); G_UPDATE(mergeSEP); @@ -232,7 +256,6 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowZombies); G_UPDATE(floodAsPuffette); G_UPDATE(nfaForceSize); - G_UPDATE(nfaForceShifts); G_UPDATE(highlanderSquash); G_UPDATE(maxHistoryAvailable); G_UPDATE(minHistoryAvailable); @@ -270,9 +293,14 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(miracleHistoryBonus); G_UPDATE(equivalenceEnable); G_UPDATE(allowSmallWrite); + G_UPDATE(allowSmallWriteSheng); G_UPDATE(smallWriteLargestBuffer); G_UPDATE(smallWriteLargestBufferBad); G_UPDATE(limitSmallWriteOutfixSize); + G_UPDATE(smallWriteMaxPatterns); + G_UPDATE(smallWriteMaxLiterals); + G_UPDATE(allowTamarama); + G_UPDATE(tamaChunkSize); G_UPDATE(limitPatternCount); G_UPDATE(limitPatternLength); G_UPDATE(limitGraphVertices); @@ -309,7 +337,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = false; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; @@ -325,7 +355,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = true; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + 
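/*
 * Rough sketch of what the G_UPDATE block above implies: each registered Grey
 * field (now including the Violet, Sheng and Tamarama controls) can be
 * overridden by name from a "name:value;" string. The exact separator and
 * error handling of applyGreyOverrides are assumed here, not copied from the
 * library.
 */
#include <cctype>
#include <map>
#include <sstream>
#include <string>

static std::map<std::string, unsigned long>
parse_overrides(const std::string &s) {
    std::map<std::string, unsigned long> kv;
    std::istringstream ss(s);
    std::string item;
    while (std::getline(ss, item, ';')) {
        const auto colon = item.find(':');
        if (colon == std::string::npos || colon + 1 >= item.size() ||
            !std::isdigit(static_cast<unsigned char>(item[colon + 1]))) {
            continue; // skip malformed entries in this sketch
        }
        kv[item.substr(0, colon)] = std::stoul(item.substr(colon + 1));
    }
    return kv;
}

// e.g. parse_overrides("allowViolet:0;tamaChunkSize:50") maps
// "allowViolet" -> 0 and "tamaChunkSize" -> 50.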
g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; @@ -341,7 +373,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = true; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; diff --git a/src/grey.h b/src/grey.h index a2261052..90f5f826 100644 --- a/src/grey.h +++ b/src/grey.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -50,8 +50,11 @@ struct Grey { bool allowLitHaig; bool allowLbr; bool allowMcClellan; + bool allowSheng; bool allowPuff; + bool allowLiteral; bool allowRose; + bool allowViolet; bool allowExtendedNFA; bool allowLimExNFA; bool allowAnchoredAcyclic; @@ -62,6 +65,14 @@ struct Grey { bool allowNoodle; bool fdrAllowTeddy; + u32 violetAvoidSuffixes; /* 0=never, 1=sometimes, 2=always */ + bool violetAvoidWeakInfixes; + bool violetDoubleCut; + bool violetExtractStrongLiterals; + bool violetLiteralChains; + u32 violetDoubleCutLiteralLen; + u32 violetEarlyCleanLiteralLen; + bool puffImproveHead; bool castleExclusive; // enable castle mutual exclusion analysis @@ -88,7 +99,6 @@ struct Grey { bool floodAsPuffette; u32 nfaForceSize; - u32 nfaForceShifts; u32 maxHistoryAvailable; u32 minHistoryAvailable; @@ -140,9 +150,16 @@ struct Grey { // SmallWrite engine bool allowSmallWrite; + bool allowSmallWriteSheng; u32 smallWriteLargestBuffer; // largest buffer that can be small write u32 smallWriteLargestBufferBad;// largest buffer that can be small write u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs + u32 smallWriteMaxPatterns; // only try small writes if fewer patterns + u32 smallWriteMaxLiterals; // only try small writes if fewer literals + + // Tamarama engine + bool allowTamarama; + u32 tamaChunkSize; //!< max chunk size for exclusivity analysis in Tamarama enum DumpFlags { DUMP_NONE = 0, diff --git a/src/hs.cpp b/src/hs.cpp index 3680e79e..07f6d2c1 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -219,7 +219,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, : get_current_target(); CompileContext cc(isStreaming, isVectored, target_info, g); - NG ng(cc, somPrecision); + NG ng(cc, elements, somPrecision); try { for (unsigned int i = 0; i < elements; i++) { diff --git a/src/hs_compile.h b/src/hs_compile.h index 48168cc2..c5212cbe 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -98,6 +98,12 @@ extern "C" * The library was unable to allocate temporary storage used during * compilation time. 
* + * - *Allocator returned misaligned memory* + * + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned + * for the largest representable data type on this platform. + * * - *Internal error* * * An unexpected error occurred: if this error is reported, please contact diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 054f05c4..2e16f1ac 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ #include "fdr/fdr.h" #include "nfa/accel.h" #include "nfa/shufti.h" +#include "nfa/truffle.h" #include "nfa/vermicelli.h" #include @@ -64,8 +65,13 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, case ACCEL_SHUFTI: DEBUG_PRINTF("single shufti\n"); return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); + case ACCEL_TRUFFLE: + DEBUG_PRINTF("truffle\n"); + return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end); default: /* no acceleration, fall through and return current ptr */ + DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); + assert(aux->accel_type == ACCEL_NONE); return ptr; } } diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index b3978017..b1814245 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -35,9 +35,11 @@ #include "hwlm_internal.h" #include "noodle_engine.h" #include "noodle_build.h" +#include "scratch.h" #include "ue2common.h" #include "fdr/fdr_compile.h" #include "nfa/shufticompile.h" +#include "nfa/trufflecompile.h" #include "util/alloc.h" #include "util/bitutils.h" #include "util/charreach.h" @@ -62,6 +64,28 @@ namespace ue2 { static const unsigned int MAX_ACCEL_OFFSET = 16; static const unsigned int MAX_SHUFTI_WIDTH = 240; +static +size_t mask_overhang(const hwlmLiteral &lit) { + size_t msk_true_size = lit.msk.size(); + assert(msk_true_size <= HWLM_MASKLEN); + assert(HWLM_MASKLEN <= MAX_ACCEL_OFFSET); + for (u8 c : lit.msk) { + if (!c) { + msk_true_size--; + } else { + break; + } + } + + if (lit.s.length() >= msk_true_size) { + return 0; + } + + /* only short literals should be able to have a mask which overhangs */ + assert(lit.s.length() < MAX_ACCEL_OFFSET); + return msk_true_size - lit.s.length(); +} + static bool findDVerm(const vector &lits, AccelAux *aux) { const hwlmLiteral &first = *lits.front(); @@ -167,7 +191,8 @@ bool findDVerm(const vector &lits, AccelAux *aux) { } if (found) { - curr.max_offset = MAX(curr.max_offset, j); + assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); + ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); break; } } @@ -288,8 +313,8 @@ bool findSVerm(const vector &lits, AccelAux *aux) { } if (found) { - curr.max_offset = MAX(curr.max_offset, j); - break; + assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); + ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); } } } @@ -346,6 +371,25 @@ void filterLits(const vector &lits, hwlm_group_t expected_groups, } } +static +bool litGuardedByCharReach(const CharReach &cr, const hwlmLiteral &lit, + u32 max_offset) { + for (u32 i = 0; i <= max_offset && i < lit.s.length(); i++) { + unsigned char c = lit.s[i]; + if (lit.nocase) { + if (cr.test(mytoupper(c)) && cr.test(mytolower(c))) { + return true; + } + } else { + if (cr.test(c)) { + return true; + } + } + } + + 
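/*
 * Self-contained restatement, using plain std types instead of hwlmLiteral, of
 * the mask_overhang() helper introduced above: leading zero bytes of the msk
 * are ignored, and any remaining mask bytes beyond the literal's own length
 * "overhang" the start of the match, which is why the accel code below widens
 * max_offset by that amount.
 */
#include <cstddef>
#include <string>
#include <vector>

static std::size_t mask_overhang(const std::string &lit,
                                 const std::vector<unsigned char> &msk) {
    std::size_t true_size = msk.size();
    for (unsigned char c : msk) {
        if (c) {
            break;
        }
        true_size--; // leading zero mask bytes constrain nothing
    }
    return lit.size() >= true_size ? 0 : true_size - lit.size();
}

// e.g. a 2-byte literal whose 8-byte mask has 3 trailing non-zero bytes
// overhangs the literal start by one byte:
// mask_overhang("ab", {0, 0, 0, 0, 0, 0xff, 0xff, 0xff}) == 1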
return false; +} + static void findForwardAccelScheme(const vector &lits, hwlm_group_t expected_groups, AccelAux *aux) { @@ -363,29 +407,45 @@ void findForwardAccelScheme(const vector &lits, return; } + /* look for shufti/truffle */ + vector reach(MAX_ACCEL_OFFSET, CharReach()); for (const auto &lit : lits) { if (!(lit.groups & expected_groups)) { continue; } - for (u32 i = 0; i < MAX_ACCEL_OFFSET && i < lit.s.length(); i++) { - unsigned char c = lit.s[i]; + u32 overhang = mask_overhang(lit); + for (u32 i = 0; i < overhang; i++) { + /* this offset overhangs the start of the real literal; look at the + * msk/cmp */ + for (u32 j = 0; j < N_CHARS; j++) { + if ((j & lit.msk[i]) == lit.cmp[i]) { + reach[i].set(j); + } + } + } + for (u32 i = overhang; i < MAX_ACCEL_OFFSET; i++) { + CharReach &reach_i = reach[i]; + u32 i_effective = i - overhang; + + if (litGuardedByCharReach(reach_i, lit, i_effective)) { + continue; + } + unsigned char c = i_effective < lit.s.length() ? lit.s[i_effective] + : lit.s.back(); if (lit.nocase) { - DEBUG_PRINTF("adding %02hhx to %u\n", mytoupper(c), i); - DEBUG_PRINTF("adding %02hhx to %u\n", mytolower(c), i); - reach[i].set(mytoupper(c)); - reach[i].set(mytolower(c)); + reach_i.set(mytoupper(c)); + reach_i.set(mytolower(c)); } else { - DEBUG_PRINTF("adding %02hhx to %u\n", c, i); - reach[i].set(c); + reach_i.set(c); } } } u32 min_count = ~0U; u32 min_offset = ~0U; - for (u32 i = 0; i < min_len; i++) { + for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) { size_t count = reach[i].count(); DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i, describeClass(reach[i]).c_str(), count); @@ -394,10 +454,9 @@ void findForwardAccelScheme(const vector &lits, min_offset = i; } } - assert(min_offset <= min_len); if (min_count > MAX_SHUFTI_WIDTH) { - DEBUG_PRINTF("min shufti with %u chars is too wide\n", min_count); + DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count); return; } @@ -410,7 +469,11 @@ void findForwardAccelScheme(const vector &lits, return; } - DEBUG_PRINTF("fail\n"); + truffleBuildMasks(cr, &aux->truffle.mask1, &aux->truffle.mask2); + DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n", + describeClass(cr).c_str(), cr.count(), min_offset); + aux->truffle.accel_type = ACCEL_TRUFFLE; + aux->truffle.offset = verify_u8(min_offset); } static @@ -466,6 +529,10 @@ bool isNoodleable(const vector &lits, stream_control->history_max); return false; } + if (2 * lits.front().s.length() - 2 > FDR_TEMP_BUF_SIZE) { + assert(0); + return false; + } } if (!lits.front().msk.empty()) { diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index e2f80a59..1d1ab4e6 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -37,7 +37,6 @@ #include "util/compare.h" #include "util/masked_move.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include #include diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index b3673246..40575409 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -115,7 +115,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, v = and128(v, caseMask); } - u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v))); + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); @@ -142,7 +143,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset, v = and128(v, caseMask); } - u32 z = 
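/*
 * Simplified sketch of the selection logic in findForwardAccelScheme() above,
 * ignoring msk/cmp handling, case-insensitivity and the overhang bookkeeping:
 * for each lookahead offset, union the bytes any literal may present there,
 * then accelerate on the offset whose byte class is narrowest (shufti if it is
 * small enough, otherwise the newly added truffle fallback).
 */
#include <bitset>
#include <cstddef>
#include <string>
#include <vector>

struct OffsetPick {
    std::size_t offset;
    std::size_t count;
};

static OffsetPick pick_accel_offset(const std::vector<std::string> &lits,
                                    std::size_t max_offset) {
    std::vector<std::bitset<256>> reach(max_offset);
    for (const std::string &s : lits) {
        if (s.empty()) {
            continue;
        }
        for (std::size_t i = 0; i < max_offset; i++) {
            // mirrors the lit.s.back() fallback above for offsets past the end
            // of a short literal
            const unsigned char c =
                static_cast<unsigned char>(i < s.size() ? s[i] : s.back());
            reach[i].set(c);
        }
    }
    OffsetPick best{0, 257}; // wider than any real byte class
    for (std::size_t i = 0; i < max_offset; i++) {
        if (reach[i].count() < best.count) {
            best = {i, reach[i].count()};
        }
    }
    return best; // caller: shufti if best.count is small, else truffle
}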
movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v))); + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); // mask out where we can't match u32 buf_off = start - offset; diff --git a/src/nfa/mcclellancompile_accel.cpp b/src/nfa/accel_dfa_build_strat.cpp old mode 100644 new mode 100755 similarity index 58% rename from src/nfa/mcclellancompile_accel.cpp rename to src/nfa/accel_dfa_build_strat.cpp index c5325fcc..ba21adc7 --- a/src/nfa/mcclellancompile_accel.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,18 +26,20 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "mcclellancompile_accel.h" - -#include "mcclellancompile_util.h" +#include "accel_dfa_build_strat.h" +#include "accel.h" #include "grey.h" #include "nfagraph/ng_limex_accel.h" +#include "shufticompile.h" +#include "trufflecompile.h" #include "util/charreach.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/verify_types.h" -#include #include +#include #define PATHS_LIMIT 500 @@ -46,14 +48,13 @@ using namespace std; namespace ue2 { namespace { - struct path { vector reach; dstate_id_t dest = DEAD_STATE; - explicit path(dstate_id_t base) : dest(base) {} + explicit path(dstate_id_t base) : dest(base) { + } +}; }; - -} static UNUSED string describeClasses(const vector &v) { @@ -85,8 +86,8 @@ bool is_useful_path(const vector &good, const path &p) { goto next; } } - DEBUG_PRINTF("better: [%s] -> %u\n", - describeClasses(g.reach).c_str(), g.dest); + DEBUG_PRINTF("better: [%s] -> %u\n", describeClasses(g.reach).c_str(), + g.dest); return false; next:; @@ -106,8 +107,7 @@ path append(const path &orig, const CharReach &cr, u32 new_dest) { static void extend(const raw_dfa &rdfa, const path &p, - map > &all, - vector &out) { + map> &all, vector &out) { dstate s = rdfa.states[p.dest]; if (!p.reach.empty() && p.reach.back().none()) { @@ -147,17 +147,17 @@ void extend(const raw_dfa &rdfa, const path &p, } DEBUG_PRINTF("----good: [%s] -> %u\n", - describeClasses(pp.reach).c_str(), pp.dest); + describeClasses(pp.reach).c_str(), pp.dest); all[e.first].push_back(pp); out.push_back(pp); } } static -vector > generate_paths(const raw_dfa &rdfa, dstate_id_t base, - u32 len) { - vector paths{ path(base) }; - map > all; +vector> generate_paths(const raw_dfa &rdfa, + dstate_id_t base, u32 len) { + vector paths{path(base)}; + map> all; all[base].push_back(path(base)); for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) { vector next_gen; @@ -170,7 +170,7 @@ vector > generate_paths(const raw_dfa &rdfa, dstate_id_t base, dump_paths(paths); - vector > rv; + vector> rv; for (auto &p : paths) { rv.push_back(move(p.reach)); } @@ -181,16 +181,58 @@ static AccelScheme look_for_offset_accel(const raw_dfa &rdfa, dstate_id_t base, u32 max_allowed_accel_offset) { DEBUG_PRINTF("looking for accel for %hu\n", base); - vector > paths = generate_paths(rdfa, base, - max_allowed_accel_offset + 1); + vector> paths = + generate_paths(rdfa, base, max_allowed_accel_offset + 1); AccelScheme as = findBestAccelScheme(paths, CharReach(), true); DEBUG_PRINTF("found %s + %u\n", describeClass(as.cr).c_str(), as.offset); return as; } +static UNUSED +bool better(const AccelScheme &a, const AccelScheme &b) { + if (!a.double_byte.empty() && 
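/*
 * Sketch of the two-byte "noodle" probe that the hunk above rewrites in terms
 * of lshiftbyte_m128(): mark bytes equal to c1, shift those marks up one lane,
 * and AND with the marks for c2, so a set bit in the movemask means c1 is
 * immediately followed by c2 at that position. Raw SSE2 intrinsics are used
 * here instead of the library's simd_utils wrappers.
 */
#include <emmintrin.h>
#include <cstdint>

static uint32_t double_byte_hits(const uint8_t *p /* >= 16 readable bytes */,
                                 uint8_t c1, uint8_t c2) {
    const __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
    const __m128i e1 = _mm_cmpeq_epi8(v, _mm_set1_epi8(static_cast<char>(c1)));
    const __m128i e2 = _mm_cmpeq_epi8(v, _mm_set1_epi8(static_cast<char>(c2)));
    const __m128i pair = _mm_and_si128(_mm_slli_si128(e1, 1), e2);
    // bit i set => p[i - 1] == c1 && p[i] == c2
    return static_cast<uint32_t>(_mm_movemask_epi8(pair));
}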
b.double_byte.empty()) { + return true; + } + + if (!b.double_byte.empty()) { + return false; + } + + return a.cr.count() < b.cr.count(); +} + +static +vector reverse_alpha_remapping(const raw_dfa &rdfa) { + vector rv(rdfa.alpha_size - 1); /* TOP not required */ + + for (u32 i = 0; i < N_CHARS; i++) { + rv.at(rdfa.alpha_remap[i]).set(i); + } + + return rv; +} + +static +bool double_byte_ok(const AccelScheme &info) { + return !info.double_byte.empty() && + info.double_cr.count() < info.double_byte.size() && + info.double_cr.count() <= 2 && !info.double_byte.empty(); +} + +static +bool has_self_loop(dstate_id_t s, const raw_dfa &raw) { + u16 top_remap = raw.alpha_remap[TOP]; + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + if (i != top_remap && raw.states[s].next[i] == s) { + return true; + } + } + return false; +} + static vector find_nonexit_symbols(const raw_dfa &rdfa, - const CharReach &escape) { + const CharReach &escape) { set rv; CharReach nonexit = ~escape; for (auto i = nonexit.find_first(); i != CharReach::npos; @@ -201,9 +243,58 @@ vector find_nonexit_symbols(const raw_dfa &rdfa, return vector(rv.begin(), rv.end()); } +static +dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { + if (raw.start_floating != DEAD_STATE) { + DEBUG_PRINTF("has floating start\n"); + return raw.start_floating; + } + + DEBUG_PRINTF("looking for SDS proxy\n"); + + dstate_id_t s = raw.start_anchored; + + if (has_self_loop(s, raw)) { + return s; + } + + u16 top_remap = raw.alpha_remap[TOP]; + + ue2::unordered_set seen; + while (true) { + seen.insert(s); + DEBUG_PRINTF("basis %hu\n", s); + + /* check if we are connected to a state with a self loop */ + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + dstate_id_t t = raw.states[s].next[i]; + if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) { + return t; + } + } + + /* find a neighbour to use as a basis for looking for the sds proxy */ + dstate_id_t t = DEAD_STATE; + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + dstate_id_t tt = raw.states[s].next[i]; + if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) { + t = tt; + break; + } + } + + if (t == DEAD_STATE) { + /* we were unable to find a state to use as a SDS proxy */ + return DEAD_STATE; + } + + s = t; + } +} + static set find_region(const raw_dfa &rdfa, dstate_id_t base, - const AccelScheme &ei) { + const AccelScheme &ei) { DEBUG_PRINTF("looking for region around %hu\n", base); set region = {base}; @@ -236,98 +327,10 @@ set find_region(const raw_dfa &rdfa, dstate_id_t base, return region; } -static -bool better(const AccelScheme &a, const AccelScheme &b) { - if (!a.double_byte.empty() && b.double_byte.empty()) { - return true; - } - - if (!b.double_byte.empty()) { - return false; - } - - return a.cr.count() < b.cr.count(); -} - -static -vector reverse_alpha_remapping(const raw_dfa &rdfa) { - vector rv(rdfa.alpha_size - 1); /* TOP not required */ - - for (u32 i = 0; i < N_CHARS; i++) { - rv.at(rdfa.alpha_remap[i]).set(i); - } - - return rv; -} - -map populateAccelerationInfo(const raw_dfa &rdfa, - const dfa_build_strat &strat, - const Grey &grey) { - map rv; - if (!grey.accelerateDFA) { - return rv; - } - - dstate_id_t sds_proxy = get_sds_or_proxy(rdfa); - DEBUG_PRINTF("sds %hu\n", sds_proxy); - - for (size_t i = 0; i < rdfa.states.size(); i++) { - if (i == DEAD_STATE) { - continue; - } - - /* Note on report acceleration states: While we can't accelerate while we - * are spamming out callbacks, the QR code paths don't raise reports - * during scanning so they 
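/*
 * Toy model of get_sds_or_proxy() above, without TOP-symbol handling: if the
 * DFA has no floating start state, walk out from the anchored start looking
 * for a reachable state with a self loop; that state then stands in for the
 * "start of dot-star" state when deciding how aggressive acceleration may be.
 * The transition-table shape and DEAD encoding here are assumptions for the
 * sketch, not the library's raw_dfa layout.
 */
#include <cstdint>
#include <unordered_set>
#include <vector>

using State = std::uint32_t;
static constexpr State DEAD = 0;

static bool has_self_loop(State s, const std::vector<std::vector<State>> &next) {
    for (State t : next[s]) {
        if (t == s) {
            return true;
        }
    }
    return false;
}

static State sds_proxy(State anchored_start,
                       const std::vector<std::vector<State>> &next) {
    State s = anchored_start;
    std::unordered_set<State> seen;
    while (s != DEAD && !seen.count(s)) {
        if (has_self_loop(s, next)) {
            return s;
        }
        seen.insert(s);
        // prefer a neighbour that already has a self loop, otherwise take any
        // unvisited neighbour as the next basis for the search
        State basis = DEAD;
        for (State t : next[s]) {
            if (t == DEAD || seen.count(t)) {
                continue;
            }
            if (has_self_loop(t, next)) {
                return t;
            }
            if (basis == DEAD) {
                basis = t;
            }
        }
        s = basis; // DEAD here means no usable proxy was found
    }
    return DEAD;
}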
can accelerate report states. */ - if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) { - continue; - } - - size_t single_limit = i == sds_proxy ? ACCEL_DFA_MAX_FLOATING_STOP_CHAR - : ACCEL_DFA_MAX_STOP_CHAR; - DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit); - - AccelScheme ei = strat.find_escape_strings(i); - if (ei.cr.count() > single_limit) { - DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i, - ei.cr.count()); - continue; - } - - DEBUG_PRINTF("state %zu should be accelerable %zu\n", - i, ei.cr.count()); - - rv[i] = ei; - } - - /* provide accleration states to states in the region of sds */ - if (contains(rv, sds_proxy)) { - AccelScheme sds_ei = rv[sds_proxy]; - sds_ei.double_byte.clear(); /* region based on single byte scheme - * may differ from double byte */ - DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n", - sds_ei.cr.count()); - auto sds_region = find_region(rdfa, sds_proxy, sds_ei); - for (auto s : sds_region) { - if (!contains(rv, s) || better(sds_ei, rv[s])) { - rv[s] = sds_ei; - } - } - } - - return rv; -} - -static -bool double_byte_ok(const AccelScheme &info) { - return !info.double_byte.empty() - && info.double_cr.count() < info.double_byte.size() - && info.double_cr.count() <= 2 && !info.double_byte.empty(); -} - -AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx, - u32 max_allowed_accel_offset) { +AccelScheme +accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const { AccelScheme rv; + const raw_dfa &rdfa = get_raw(); rv.cr.clear(); rv.offset = 0; const dstate &raw = rdfa.states[this_idx]; @@ -354,7 +357,7 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) { DEBUG_PRINTF("leads to report\n"); - outs2_broken = true; /* cannot accelerate over reports */ + outs2_broken = true; /* cannot accelerate over reports */ continue; } succs[next_id] |= cr_i; @@ -402,14 +405,12 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx DEBUG_PRINTF("this %u, sds proxy %hu\n", this_idx, get_sds_or_proxy(rdfa)); DEBUG_PRINTF("broken %d\n", outs2_broken); - if (!double_byte_ok(rv) && !is_triggered(rdfa.kind) - && this_idx == rdfa.start_floating - && this_idx != DEAD_STATE) { + if (!double_byte_ok(rv) && !is_triggered(rdfa.kind) && + this_idx == rdfa.start_floating && this_idx != DEAD_STATE) { DEBUG_PRINTF("looking for offset accel at %u\n", this_idx); - auto offset = look_for_offset_accel(rdfa, this_idx, - max_allowed_accel_offset); - DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(), - rv.cr.count()); + auto offset = + look_for_offset_accel(rdfa, this_idx, max_allowed_offset_accel()); + DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(), rv.cr.count()); if (double_byte_ok(offset) || offset.cr.count() < rv.cr.count()) { DEBUG_PRINTF("using offset accel\n"); rv = offset; @@ -419,4 +420,172 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx return rv; } +void +accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, + const AccelScheme &info, + void *accel_out) { + AccelAux *accel = (AccelAux *)accel_out; + + DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset, + info.double_offset); + accel->generic.offset = verify_u8(info.offset); + + if (double_byte_ok(info) && info.double_cr.none() && + info.double_byte.size() == 1) { + accel->accel_type = ACCEL_DVERM; + accel->dverm.c1 = 
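/*
 * Thumbnail of the quantity both the old populateAccelerationInfo() and the
 * new find_escape_strings() are estimating: a state is worth accelerating when
 * the set of bytes that move the DFA off that state (its "escape" class) is
 * small, since the runtime can then skip ahead with vermicelli/shufti/truffle
 * until one of those bytes appears. Byte-indexed toy table; the real code works
 * over the remapped alphabet and merges successor classes.
 */
#include <array>
#include <bitset>
#include <cstdint>
#include <vector>

using Row = std::array<uint32_t, 256>; // one row of a transition table

static std::bitset<256> escape_set(uint32_t s, const std::vector<Row> &next) {
    std::bitset<256> esc;
    for (uint32_t c = 0; c < 256; c++) {
        if (next[s][c] != s) {
            esc.set(c); // byte c moves the DFA off state s
        }
    }
    return esc;
}

// escape_set(s).count() is compared against a budget; the SDS proxy state is
// allowed a larger budget (max_floating_stop_char) than ordinary states.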
info.double_byte.begin()->first; + accel->dverm.c2 = info.double_byte.begin()->second; + accel->dverm.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx); + return; + } + + if (double_byte_ok(info) && info.double_cr.none() && + (info.double_byte.size() == 2 || info.double_byte.size() == 4)) { + bool ok = true; + + assert(!info.double_byte.empty()); + u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; + u8 secondC = info.double_byte.begin()->second & CASE_CLEAR; + + for (const pair &p : info.double_byte) { + if ((p.first & CASE_CLEAR) != firstC || + (p.second & CASE_CLEAR) != secondC) { + ok = false; + break; + } + } + + if (ok) { + accel->accel_type = ACCEL_DVERM_NOCASE; + accel->dverm.c1 = firstC; + accel->dverm.c2 = secondC; + accel->dverm.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); + return; + } + + u8 m1; + u8 m2; + if (buildDvermMask(info.double_byte, &m1, &m2)) { + accel->accel_type = ACCEL_DVERM_MASKED; + accel->dverm.offset = verify_u8(info.double_offset); + accel->dverm.c1 = info.double_byte.begin()->first & m1; + accel->dverm.c2 = info.double_byte.begin()->second & m2; + accel->dverm.m1 = m1; + accel->dverm.m2 = m2; + DEBUG_PRINTF( + "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", + accel->dverm.c1, accel->dverm.c2); + return; + } + } + + if (double_byte_ok(info) && + shuftiBuildDoubleMasks(info.double_cr, info.double_byte, + &accel->dshufti.lo1, &accel->dshufti.hi1, + &accel->dshufti.lo2, &accel->dshufti.hi2)) { + accel->accel_type = ACCEL_DSHUFTI; + accel->dshufti.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is double shufti\n", this_idx); + return; + } + + if (info.cr.none()) { + accel->accel_type = ACCEL_RED_TAPE; + DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" + " from which there is no escape\n", + this_idx); + return; + } + + if (info.cr.count() == 1) { + accel->accel_type = ACCEL_VERM; + accel->verm.c = info.cr.find_first(); + DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); + return; + } + + if (info.cr.count() == 2 && info.cr.isCaselessChar()) { + accel->accel_type = ACCEL_VERM_NOCASE; + accel->verm.c = info.cr.find_first() & CASE_CLEAR; + DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); + return; + } + + if (info.cr.count() > max_floating_stop_char()) { + accel->accel_type = ACCEL_NONE; + DEBUG_PRINTF("state %hu is too broad\n", this_idx); + return; + } + + accel->accel_type = ACCEL_SHUFTI; + if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo, &accel->shufti.hi)) { + DEBUG_PRINTF("state %hu is shufti\n", this_idx); + return; + } + + assert(!info.cr.none()); + accel->accel_type = ACCEL_TRUFFLE; + truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2); + DEBUG_PRINTF("state %hu is truffle\n", this_idx); } + +map +accel_dfa_build_strat::getAccelInfo(const Grey &grey) { + map rv; + raw_dfa &rdfa = get_raw(); + if (!grey.accelerateDFA) { + return rv; + } + + dstate_id_t sds_proxy = get_sds_or_proxy(rdfa); + DEBUG_PRINTF("sds %hu\n", sds_proxy); + + for (size_t i = 0; i < rdfa.states.size(); i++) { + if (i == DEAD_STATE) { + continue; + } + + /* Note on report acceleration states: While we can't accelerate while + * we + * are spamming out callbacks, the QR code paths don't raise reports + * during scanning so they can accelerate report states. 
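/*
 * Condensed view, with assumed thresholds, of the single-byte end of the
 * cascade in buildAccel() above: pick the cheapest scheme that can represent
 * the escape class, falling back to truffle, which can represent any class.
 * The real function also tries the double-byte (DVERM/DSHUFTI) schemes first
 * and emits ACCEL_NONE when the class exceeds the stop-character budget.
 */
#include <bitset>
#include <cstddef>

enum class AccelKind { RED_TAPE, VERM, VERM_NOCASE, SHUFTI, TRUFFLE };

static AccelKind classify(const std::bitset<256> &cr, bool caseless_pair,
                          std::size_t shufti_limit) {
    if (cr.none()) {
        return AccelKind::RED_TAPE;    // dead end: nothing can escape
    }
    if (cr.count() == 1) {
        return AccelKind::VERM;        // single byte: vermicelli
    }
    if (cr.count() == 2 && caseless_pair) {
        return AccelKind::VERM_NOCASE; // 'x'/'X': caseless vermicelli
    }
    if (cr.count() <= shufti_limit) {
        return AccelKind::SHUFTI;      // small class: shufti masks
    }
    return AccelKind::TRUFFLE;         // anything else: truffle masks
}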
*/ + if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) { + continue; + } + + size_t single_limit = + i == sds_proxy ? max_floating_stop_char() : max_stop_char(); + DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit); + + AccelScheme ei = find_escape_strings(i); + if (ei.cr.count() > single_limit) { + DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i, + ei.cr.count()); + continue; + } + + DEBUG_PRINTF("state %zu should be accelerable %zu\n", i, ei.cr.count()); + + rv[i] = ei; + } + + /* provide accleration states to states in the region of sds */ + if (contains(rv, sds_proxy)) { + AccelScheme sds_ei = rv[sds_proxy]; + sds_ei.double_byte.clear(); /* region based on single byte scheme + * may differ from double byte */ + DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n", + sds_ei.cr.count()); + auto sds_region = find_region(rdfa, sds_proxy, sds_ei); + for (auto s : sds_region) { + if (!contains(rv, s) || better(sds_ei, rv[s])) { + rv[s] = sds_ei; + } + } + } + + return rv; +} +}; diff --git a/src/nfa/accel_dfa_build_strat.h b/src/nfa/accel_dfa_build_strat.h new file mode 100755 index 00000000..3cfaf272 --- /dev/null +++ b/src/nfa/accel_dfa_build_strat.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ACCEL_DFA_BUILD_STRAT_H +#define ACCEL_DFA_BUILD_STRAT_H + +#include "rdfa.h" +#include "dfa_build_strat.h" +#include "ue2common.h" +#include "util/accel_scheme.h" + +#include + +namespace ue2 { + +class ReportManager; +struct Grey; + +class accel_dfa_build_strat : public dfa_build_strat { +public: + explicit accel_dfa_build_strat(const ReportManager &rm_in) + : dfa_build_strat(rm_in) {} + virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const; + virtual size_t accelSize(void) const = 0; + virtual u32 max_allowed_offset_accel() const = 0; + virtual u32 max_stop_char() const = 0; + virtual u32 max_floating_stop_char() const = 0; + virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info, + void *accel_out); + virtual std::map getAccelInfo(const Grey &grey); +}; + +} // namespace ue2 + +#endif // ACCEL_DFA_BUILD_STRAT_H diff --git a/src/nfa/callback.h b/src/nfa/callback.h index dfcd1b9f..9bdaa8d1 100644 --- a/src/nfa/callback.h +++ b/src/nfa/callback.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,30 +37,26 @@ /** \brief The type for an NFA callback. * - * This is a function that takes as arguments the current offset where the - * match occurs, the id of the match and the context pointer that was passed - * into the NFA API function that executed the NFA. + * This is a function that takes as arguments the current start and end offsets + * where the match occurs, the id of the match and the context pointer that was + * passed into the NFA API function that executed the NFA. * - * The offset where the match occurs will be the offset after the character - * that caused the match. Thus, if we have a buffer containing 'abc', then a - * pattern that matches an empty string will have an offset of 0, a pattern - * that matches 'a' will have an offset of 1, and a pattern that matches 'abc' - * will have an offset of 3, which will be a value that is 'beyond' the size of - * the buffer. That is, if we have n characters in the buffer, there are n+1 - * different potential offsets for matches. + * The start offset is the "start of match" (SOM) offset for the match. It is + * only provided by engines that natively support SOM tracking (e.g. Gough). + * + * The end offset will be the offset after the character that caused the match. + * Thus, if we have a buffer containing 'abc', then a pattern that matches an + * empty string will have an offset of 0, a pattern that matches 'a' will have + * an offset of 1, and a pattern that matches 'abc' will have an offset of 3, + * which will be a value that is 'beyond' the size of the buffer. That is, if + * we have n characters in the buffer, there are n+1 different potential + * offsets for matches. * * This function should return an int - currently the possible return values * are 0, which means 'stop running the engine' or non-zero, which means * 'continue matching'. */ -typedef int (*NfaCallback)(u64a offset, ReportID id, void *context); - -/** \brief The type for an NFA callback which also tracks start of match. 
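/*
 * Usage sketch for the unified callback documented in the callback.h hunk
 * below: NfaCallback now receives both a start ("SOM") and an end offset, with
 * engines that do not track SOM passing 0 for the start. The u64a/ReportID
 * stand-ins below only make the example self-contained; they mirror, but are
 * not, the library's internal typedefs.
 */
#include <cstdint>
#include <cstdio>

using u64a = std::uint64_t;
using ReportID = std::uint32_t;

static int onNfaMatch(u64a start, u64a end, ReportID id, void *context) {
    auto *count = static_cast<unsigned *>(context);
    ++*count;
    std::printf("report %u matched over [%llu, %llu)\n", id,
                static_cast<unsigned long long>(start),
                static_cast<unsigned long long>(end));
    return 1; // non-zero: keep matching; 0 halts the engine
}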
- * - * see \ref NfaCallback - */ -typedef int (*SomNfaCallback)(u64a from_offset, u64a to_offset, ReportID id, - void *context); +typedef int (*NfaCallback)(u64a start, u64a end, ReportID id, void *context); /** * standard \ref NfaCallback return value indicating that engine execution diff --git a/src/nfa/castle.c b/src/nfa/castle.c index 13a44a97..6a72ae31 100644 --- a/src/nfa/castle.c +++ b/src/nfa/castle.c @@ -98,7 +98,7 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q, if (match == REPEAT_MATCH) { DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset, subIdx, sub->report); - if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) { + if (q->cb(0, offset, sub->report, q->context) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } @@ -457,7 +457,7 @@ char subCastleFireMatch(const struct Castle *c, const void *full_state, i = mmbit_iterate(matching, c->numRepeats, i)) { const struct SubCastle *sub = getSubCastle(c, i); DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, i); - if (cb(offset, sub->report, ctx) == MO_HALT_MATCHING) { + if (cb(0, offset, sub->report, ctx) == MO_HALT_MATCHING) { DEBUG_PRINTF("caller told us to halt\n"); return MO_HALT_MATCHING; } @@ -979,6 +979,46 @@ char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, return castleInAccept(c, q, report, q_cur_offset(q)); } +char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA_0); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + const struct SubCastle *sub = getSubCastle(c, activeIdx); + if (subCastleInAccept(c, q, sub->report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + const struct SubCastle *sub = getSubCastle(c, i); + if (subCastleInAccept(c, q, sub->report, offset, i)) { + return 1; + } + } + } + + return 0; +} + + char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) { assert(n && q); assert(n->type == CASTLE_NFA_0); diff --git a/src/nfa/castle.h b/src/nfa/castle.h index 8fc3514b..84d79097 100644 --- a/src/nfa/castle.h +++ b/src/nfa/castle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,6 +44,7 @@ char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecCastle0_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecCastle0_queueInitState(const struct NFA *n, struct mq *q); char 
nfaExecCastle0_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/castle_dump.cpp b/src/nfa/castle_dump.cpp index dd0e369f..fd1521a5 100644 --- a/src/nfa/castle_dump.cpp +++ b/src/nfa/castle_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,8 @@ namespace ue2 { -void nfaExecCastle0_dumpDot(const struct NFA *, FILE *) { +void nfaExecCastle0_dumpDot(const struct NFA *, FILE *, + UNUSED const std::string &base) { // No GraphViz output for Castles. } diff --git a/src/nfa/castle_dump.h b/src/nfa/castle_dump.h index c0b1f899..94dadec0 100644 --- a/src/nfa/castle_dump.h +++ b/src/nfa/castle_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,12 +32,14 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; namespace ue2 { -void nfaExecCastle0_dumpDot(const NFA *nfa, FILE *file); +void nfaExecCastle0_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); void nfaExecCastle0_dumpText(const NFA *nfa, FILE *file); } // namespace ue2 diff --git a/src/nfa/dfa_build_strat.cpp b/src/nfa/dfa_build_strat.cpp new file mode 100755 index 00000000..d4d418aa --- /dev/null +++ b/src/nfa/dfa_build_strat.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "dfa_build_strat.h" + +namespace ue2 { + +// prevent weak vtables for raw_report_info, dfa_build_strat and raw_dfa +raw_report_info::~raw_report_info() {} + +dfa_build_strat::~dfa_build_strat() {} + +raw_dfa::~raw_dfa() {} + +} // namespace ue2 diff --git a/src/nfa/dfa_build_strat.h b/src/nfa/dfa_build_strat.h new file mode 100644 index 00000000..cda00162 --- /dev/null +++ b/src/nfa/dfa_build_strat.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
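/*
 * Minimal illustration of the "prevent weak vtables" comment above: defining
 * one virtual member (here the destructor) out of line in exactly one
 * translation unit gives the class a single home for its vtable and RTTI,
 * instead of a weak copy emitted in every TU that includes the header. Names
 * below are illustrative, not the library's classes.
 */

// header (sketch): destructor declared but not defined inline
struct strat_base {
    virtual ~strat_base();
    virtual int cost() const = 0;
};

// exactly one .cpp file: the vtable is anchored here
strat_base::~strat_base() = default;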
+ */ + +#ifndef DFA_BUILD_STRAT_H +#define DFA_BUILD_STRAT_H + +#include "rdfa.h" +#include "ue2common.h" + +#include +#include + +struct NFA; + +namespace ue2 { + +class ReportManager; + +struct raw_report_info { + virtual ~raw_report_info(); + virtual u32 getReportListSize() const = 0; /* in bytes */ + virtual size_t size() const = 0; /* number of lists */ + virtual void fillReportLists(NFA *n, size_t base_offset, + std::vector &ro /* out */) const = 0; +}; + +class dfa_build_strat { +public: + explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {} + virtual ~dfa_build_strat(); + virtual raw_dfa &get_raw() const = 0; + virtual std::unique_ptr gatherReports( + std::vector &reports /* out */, + std::vector &reports_eod /* out */, + u8 *isSingleReport /* out */, + ReportID *arbReport /* out */) const = 0; +protected: + const ReportManager &rm; +}; + +} // namespace ue2 + +#endif // DFA_BUILD_STRAT_H diff --git a/src/nfa/gough.c b/src/nfa/gough.c index c52bca06..520aca93 100644 --- a/src/nfa/gough.c +++ b/src/nfa/gough.c @@ -110,7 +110,7 @@ u64a expandSomValue(u32 comp_slot_width, u64a curr_offset, } static really_inline -char doReports(SomNfaCallback cb, void *ctxt, const struct mcclellan *m, +char doReports(NfaCallback cb, void *ctxt, const struct mcclellan *m, const struct gough_som_info *som, u16 s, u64a loc, char eod, u16 * const cached_accept_state, u32 * const cached_accept_id, u32 * const cached_accept_som) { @@ -307,7 +307,7 @@ u16 goughEnableStarts(const struct mcclellan *m, u16 s, u64a som_offset, static really_inline char goughExec16_i(const struct mcclellan *m, struct gough_som_info *som, u16 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **c_final, + NfaCallback cb, void *ctxt, const u8 **c_final, enum MatchMode mode) { assert(ISALIGNED_N(state, 2)); @@ -461,7 +461,7 @@ with_accel: static really_inline char goughExec8_i(const struct mcclellan *m, struct gough_som_info *som, u8 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **c_final, + NfaCallback cb, void *ctxt, const u8 **c_final, enum MatchMode mode) { u8 s = *state; const u8 *c = buf, *c_end = buf + len; @@ -595,7 +595,7 @@ with_accel: static never_inline char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, u8 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **final_point, + NfaCallback cb, void *ctxt, const u8 **final_point, enum MatchMode mode) { return goughExec8_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, mode); @@ -604,7 +604,7 @@ char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, static never_inline char goughExec16_i_ni(const struct mcclellan *m, struct gough_som_info *som, u16 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **final_point, + NfaCallback cb, void *ctxt, const u8 **final_point, enum MatchMode mode) { return goughExec16_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, mode); @@ -622,7 +622,7 @@ const struct gough_som_info *getSomInfoConst(const char *state_base) { static really_inline char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, - const u8 *hend, SomNfaCallback cb, void *context, + const u8 *hend, NfaCallback cb, void *context, struct mq *q, s64a end, enum MatchMode mode) { DEBUG_PRINTF("enter\n"); struct gough_som_info *som = getSomInfo(q->state); @@ -755,7 +755,7 @@ char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, 
const u8 *buffer, static really_inline char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, - const u8 *hend, SomNfaCallback cb, void *context, + const u8 *hend, NfaCallback cb, void *context, struct mq *q, s64a end, enum MatchMode mode) { struct gough_som_info *som = getSomInfo(q->state); assert(n->type == GOUGH_NFA_16); @@ -887,7 +887,7 @@ char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -899,7 +899,7 @@ char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -911,7 +911,7 @@ char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -923,7 +923,7 @@ char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -935,7 +935,7 @@ char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -952,7 +952,7 @@ char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -994,7 +994,7 @@ char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset, char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *ctxt = q->context; u8 s = *(u8 *)q->state; u64a offset = q_cur_offset(q); @@ -1016,7 +1016,7 @@ char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *ctxt = q->context; u16 s = *(u16 *)q->state; const struct mstate_aux *aux = get_aux(m, s); @@ -1048,10 +1048,18 @@ char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, return nfaExecMcClellan16_inAccept(n, report, q); } +char 
nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan8_inAnyAccept(n, q); +} + +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan16_inAnyAccept(n, q); +} + static char goughCheckEOD(const struct NFA *nfa, u16 s, const struct gough_som_info *som, - u64a offset, SomNfaCallback cb, void *ctxt) { + u64a offset, NfaCallback cb, void *ctxt) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mstate_aux *aux = get_aux(m, s); @@ -1062,21 +1070,19 @@ char goughCheckEOD(const struct NFA *nfa, u16 s, } char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, u64a offset, - UNUSED NfaCallback callback, - SomNfaCallback som_callback, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { const struct gough_som_info *som = getSomInfoConst(state); - return goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback, + return goughCheckEOD(nfa, *(const u8 *)state, som, offset, callback, context); } char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, u64a offset, - UNUSED NfaCallback callback, - SomNfaCallback som_callback, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(ISALIGNED_N(state, 8)); const struct gough_som_info *som = getSomInfoConst(state); - return goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback, + return goughCheckEOD(nfa, *(const u16 *)state, som, offset, callback, context); } diff --git a/src/nfa/gough.h b/src/nfa/gough.h index 41d4cb5a..a7f48892 100644 --- a/src/nfa/gough.h +++ b/src/nfa/gough.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,13 +39,13 @@ struct mq; char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecGough8_queueInitState(const struct NFA *n, struct mq *q); char nfaExecGough8_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -61,13 +61,13 @@ char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char 
nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecGough16_queueInitState(const struct NFA *n, struct mq *q); char nfaExecGough16_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index 647dc496..314b6fd0 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -79,9 +79,9 @@ namespace { class gough_build_strat : public mcclellan_build_strat { public: gough_build_strat( - raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm, + raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm_in, const map &accel_info) - : mcclellan_build_strat(r, rm), rdfa(r), gg(g), + : mcclellan_build_strat(r, rm_in), rdfa(r), gg(g), accel_gough_info(accel_info) {} unique_ptr gatherReports(vector &reports /* out */, vector &reports_eod /* out */, diff --git a/src/nfa/goughdump.cpp b/src/nfa/goughdump.cpp index f4f15eea..4e6e5425 100644 --- a/src/nfa/goughdump.cpp +++ b/src/nfa/goughdump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -259,7 +259,8 @@ void dumpTransitions(const NFA *nfa, FILE *f, fprintf(f, "\n"); } -void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) { +void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == GOUGH_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -302,7 +303,8 @@ void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } -void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) { +void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == GOUGH_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); diff --git a/src/nfa/goughdump.h b/src/nfa/goughdump.h index 5e15356d..b96938e4 100644 --- a/src/nfa/goughdump.h +++ b/src/nfa/goughdump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,12 +33,16 @@ #include "ue2common.h" +#include + struct NFA; namespace ue2 { -void nfaExecGough8_dumpDot(const NFA *nfa, FILE *file); -void nfaExecGough16_dumpDot(const NFA *nfa, FILE *file); +void nfaExecGough8_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); +void nfaExecGough16_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); void nfaExecGough8_dumpText(const NFA *nfa, FILE *file); void nfaExecGough16_dumpText(const NFA *nfa, FILE *file); diff --git a/src/nfa/lbr.c b/src/nfa/lbr.c index 0d69cc2a..07e59239 100644 --- a/src/nfa/lbr.c +++ b/src/nfa/lbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -293,7 +293,7 @@ char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end, } DEBUG_PRINTF("firing match at %llu\n", i); - if (cb(i, l->report, ctx) == MO_HALT_MATCHING) { + if (cb(0, i, l->report, ctx) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } diff --git a/src/nfa/lbr.h b/src/nfa/lbr.h index 
b770477d..a9e42046 100644 --- a/src/nfa/lbr.h +++ b/src/nfa/lbr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,6 +46,7 @@ char nfaExecLbrDot_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecLbrDot_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrDot_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrDot_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrDot_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrDot_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrDot_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -66,6 +67,7 @@ char nfaExecLbrVerm_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrVerm_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrVerm_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -86,6 +88,7 @@ char nfaExecLbrNVerm_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrNVerm_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrNVerm_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -106,6 +109,7 @@ char nfaExecLbrShuf_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrShuf_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrShuf_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -126,6 +130,7 @@ char nfaExecLbrTruf_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrTruf_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrTruf_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/lbr_common_impl.h b/src/nfa/lbr_common_impl.h index 917a8e91..5ae35431 100644 --- a/src/nfa/lbr_common_impl.h +++ b/src/nfa/lbr_common_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,7 +72,7 @@ char JOIN(ENGINE_EXEC_NAME, _reportCurrent)(const struct NFA *nfa, const struct lbr_common *l = getImplNfa(nfa); u64a offset = q_cur_offset(q); DEBUG_PRINTF("firing match %u at %llu\n", l->report, offset); - q->cb(offset, l->report, q->context); + q->cb(0, offset, l->report, q->context); return 0; } @@ -94,6 
+94,15 @@ char JOIN(ENGINE_EXEC_NAME, _inAccept)(const struct NFA *nfa, return lbrInAccept(l, lstate, q->streamState, offset, report); } +char JOIN(ENGINE_EXEC_NAME, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + return JOIN(ENGINE_EXEC_NAME, _inAccept)(nfa, l->report, q); +} + char JOIN(ENGINE_EXEC_NAME, _queueInitState)(const struct NFA *nfa, struct mq *q) { assert(nfa && q); @@ -206,7 +215,7 @@ char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q, if (q->report_current) { DEBUG_PRINTF("report_current: fire match at %llu\n", q_cur_offset(q)); - int rv = q->cb(q_cur_offset(q), l->report, q->context); + int rv = q->cb(0, q_cur_offset(q), l->report, q->context); q->report_current = 0; if (rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; diff --git a/src/nfa/lbr_dump.cpp b/src/nfa/lbr_dump.cpp index 3de75333..3412ddf5 100644 --- a/src/nfa/lbr_dump.cpp +++ b/src/nfa/lbr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,23 +49,28 @@ namespace ue2 { -void nfaExecLbrDot_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrDot_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrNVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrNVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrShuf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrShuf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrTruf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrTruf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } diff --git a/src/nfa/lbr_dump.h b/src/nfa/lbr_dump.h index 5f6dd261..06ed51e2 100644 --- a/src/nfa/lbr_dump.h +++ b/src/nfa/lbr_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,16 +32,22 @@ #ifdef DUMP_SUPPORT #include +#include struct NFA; namespace ue2 { -void nfaExecLbrDot_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrVerm_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrNVerm_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrShuf_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrTruf_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecLbrDot_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrVerm_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrNVerm_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrShuf_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrTruf_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void 
nfaExecLbrDot_dumpText(const struct NFA *nfa, FILE *file); void nfaExecLbrVerm_dumpText(const struct NFA *nfa, FILE *file); void nfaExecLbrNVerm_dumpText(const struct NFA *nfa, FILE *file); diff --git a/src/nfa/limex.h b/src/nfa/limex.h index 2c429a67..ad53503c 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -30,6 +30,7 @@ #define LIMEX_H #ifdef __cplusplus +#include extern "C" { #endif @@ -40,7 +41,8 @@ extern "C" #define GENERATE_NFA_DUMP_DECL(gf_name) \ } /* extern "C" */ \ namespace ue2 { \ - void gf_name##_dumpDot(const struct NFA *nfa, FILE *file); \ + void gf_name##_dumpDot(const struct NFA *nfa, FILE *file, \ + const std::string &base); \ void gf_name##_dumpText(const struct NFA *nfa, FILE *file); \ } /* namespace ue2 */ \ extern "C" { @@ -52,14 +54,14 @@ extern "C" #define GENERATE_NFA_DECL(gf_name) \ char gf_name##_testEOD(const struct NFA *nfa, const char *state, \ const char *streamState, u64a offset, \ - NfaCallback callback, SomNfaCallback som_cb, \ - void *context); \ + NfaCallback callback, void *context); \ char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end); \ char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end); \ char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report); \ char gf_name##_reportCurrent(const struct NFA *n, struct mq *q); \ char gf_name##_inAccept(const struct NFA *n, ReportID report, \ struct mq *q); \ + char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q); \ char gf_name##_queueInitState(const struct NFA *n, struct mq *q); \ char gf_name##_initCompressedState(const struct NFA *n, u64a offset, \ void *state, u8 key); \ @@ -74,41 +76,11 @@ extern "C" struct mq *q, s64a loc); \ GENERATE_NFA_DUMP_DECL(gf_name) -GENERATE_NFA_DECL(nfaExecLimEx32_1) -GENERATE_NFA_DECL(nfaExecLimEx32_2) -GENERATE_NFA_DECL(nfaExecLimEx32_3) -GENERATE_NFA_DECL(nfaExecLimEx32_4) -GENERATE_NFA_DECL(nfaExecLimEx32_5) -GENERATE_NFA_DECL(nfaExecLimEx32_6) -GENERATE_NFA_DECL(nfaExecLimEx32_7) -GENERATE_NFA_DECL(nfaExecLimEx128_1) -GENERATE_NFA_DECL(nfaExecLimEx128_2) -GENERATE_NFA_DECL(nfaExecLimEx128_3) -GENERATE_NFA_DECL(nfaExecLimEx128_4) -GENERATE_NFA_DECL(nfaExecLimEx128_5) -GENERATE_NFA_DECL(nfaExecLimEx128_6) -GENERATE_NFA_DECL(nfaExecLimEx128_7) -GENERATE_NFA_DECL(nfaExecLimEx256_1) -GENERATE_NFA_DECL(nfaExecLimEx256_2) -GENERATE_NFA_DECL(nfaExecLimEx256_3) -GENERATE_NFA_DECL(nfaExecLimEx256_4) -GENERATE_NFA_DECL(nfaExecLimEx256_5) -GENERATE_NFA_DECL(nfaExecLimEx256_6) -GENERATE_NFA_DECL(nfaExecLimEx256_7) -GENERATE_NFA_DECL(nfaExecLimEx384_1) -GENERATE_NFA_DECL(nfaExecLimEx384_2) -GENERATE_NFA_DECL(nfaExecLimEx384_3) -GENERATE_NFA_DECL(nfaExecLimEx384_4) -GENERATE_NFA_DECL(nfaExecLimEx384_5) -GENERATE_NFA_DECL(nfaExecLimEx384_6) -GENERATE_NFA_DECL(nfaExecLimEx384_7) -GENERATE_NFA_DECL(nfaExecLimEx512_1) -GENERATE_NFA_DECL(nfaExecLimEx512_2) -GENERATE_NFA_DECL(nfaExecLimEx512_3) -GENERATE_NFA_DECL(nfaExecLimEx512_4) -GENERATE_NFA_DECL(nfaExecLimEx512_5) -GENERATE_NFA_DECL(nfaExecLimEx512_6) -GENERATE_NFA_DECL(nfaExecLimEx512_7) +GENERATE_NFA_DECL(nfaExecLimEx32) +GENERATE_NFA_DECL(nfaExecLimEx128) +GENERATE_NFA_DECL(nfaExecLimEx256) +GENERATE_NFA_DECL(nfaExecLimEx384) +GENERATE_NFA_DECL(nfaExecLimEx512) #undef GENERATE_NFA_DECL #undef GENERATE_NFA_DUMP_DECL diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index 2c73f9ff..28f37083 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -35,6 +35,7 @@ #include "accel.h" #include "limex_internal.h" #include "limex_limits.h" +#include "limex_shuffle.h" #include 
"nfa_internal.h" #include "shufti.h" #include "truffle.h" @@ -44,10 +45,7 @@ #include "ue2common.h" #include "vermicelli.h" #include "util/bitutils.h" -#include "util/shuffle.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" -#include "util/shuffle_ssse3.h" static really_inline size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, @@ -80,7 +78,7 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end) { - u32 idx = shuffleDynamic32(s, accel); + u32 idx = packedExtract32(s, accel); return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -92,7 +90,7 @@ size_t doAccel128(const m128 *state, const struct LimExNFA128 *limex, DEBUG_PRINTF("using PSHUFB for 128-bit shuffle\n"); m128 accelPerm = limex->accelPermute; m128 accelComp = limex->accelCompare; - idx = shufflePshufb128(s, accelPerm, accelComp); + idx = packedExtract128(s, accelPerm, accelComp); return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -105,17 +103,13 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, m256 accelPerm = limex->accelPermute; m256 accelComp = limex->accelCompare; #if !defined(__AVX2__) - u32 idx1 = shufflePshufb128(s.lo, accelPerm.lo, accelComp.lo); - u32 idx2 = shufflePshufb128(s.hi, accelPerm.hi, accelComp.hi); -#else - // TODO: learn you some avx2 shuffles for great good - u32 idx1 = shufflePshufb128(movdq_lo(s), movdq_lo(accelPerm), - movdq_lo(accelComp)); - u32 idx2 = shufflePshufb128(movdq_hi(s), movdq_hi(accelPerm), - movdq_hi(accelComp)); -#endif + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2) == 0); // should be no shared bits idx = idx1 | idx2; +#else + idx = packedExtract256(s, accelPerm, accelComp); +#endif return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -127,9 +121,9 @@ size_t doAccel384(const m384 *state, const struct LimExNFA384 *limex, DEBUG_PRINTF("using PSHUFB for 384-bit shuffle\n"); m384 accelPerm = limex->accelPermute; m384 accelComp = limex->accelCompare; - u32 idx1 = shufflePshufb128(s.lo, accelPerm.lo, accelComp.lo); - u32 idx2 = shufflePshufb128(s.mid, accelPerm.mid, accelComp.mid); - u32 idx3 = shufflePshufb128(s.hi, accelPerm.hi, accelComp.hi); + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.mid, accelPerm.mid, accelComp.mid); + u32 idx3 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2 & idx3) == 0); // should be no shared bits idx = idx1 | idx2 | idx3; return accelScanWrapper(accelTable, aux, input, idx, i, end); @@ -144,21 +138,17 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, m512 accelPerm = limex->accelPermute; m512 accelComp = limex->accelCompare; #if !defined(__AVX2__) - u32 idx1 = shufflePshufb128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); - u32 idx2 = shufflePshufb128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); - u32 idx3 = shufflePshufb128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); - u32 idx4 = shufflePshufb128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); -#else - u32 idx1 = shufflePshufb128(movdq_lo(s.lo), movdq_lo(accelPerm.lo), - movdq_lo(accelComp.lo)); - u32 idx2 = shufflePshufb128(movdq_hi(s.lo), movdq_hi(accelPerm.lo), - movdq_hi(accelComp.lo)); - u32 idx3 = shufflePshufb128(movdq_lo(s.hi), movdq_lo(accelPerm.hi), - 
movdq_lo(accelComp.hi)); - u32 idx4 = shufflePshufb128(movdq_hi(s.hi), movdq_hi(accelPerm.hi), - movdq_hi(accelComp.hi)); -#endif + u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); + u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); + u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); + u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits idx = idx1 | idx2 | idx3 | idx4; +#else + u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#endif return accelScanWrapper(accelTable, aux, input, idx, i, end); } diff --git a/src/nfa/limex_common_impl.h b/src/nfa/limex_common_impl.h index 6e4b7718..9523b073 100644 --- a/src/nfa/limex_common_impl.h +++ b/src/nfa/limex_common_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,7 @@ #define TESTEOD_FN JOIN(moNfaTestEod, SIZE) #define TESTEOD_REV_FN JOIN(moNfaRevTestEod, SIZE) #define LIMEX_INACCEPT_FN JOIN(limexInAccept, SIZE) +#define LIMEX_INANYACCEPT_FN JOIN(limexInAnyAccept, SIZE) #define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) #define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE) #define INITIAL_FN JOIN(moNfaInitial, SIZE) @@ -118,7 +119,7 @@ char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s, if (TESTBIT_STATE(s, a->state)) { DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", a->state, a->externalId, offset); - int rv = callback(offset, a->externalId, context); + int rv = callback(0, offset, a->externalId, context); if (unlikely(rv == MO_HALT_MATCHING)) { return 1; } @@ -149,7 +150,7 @@ char PROCESS_ACCEPTS_NOSQUASH_FN(const STATE_T *s, if (TESTBIT_STATE(s, a->state)) { DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", a->state, a->externalId, offset); - int rv = callback(offset, a->externalId, context); + int rv = callback(0, offset, a->externalId, context); if (unlikely(rv == MO_HALT_MATCHING)) { return 1; } @@ -374,11 +375,32 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, return 0; } +static really_inline +char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset) { + assert(limex); + + const STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T accstate = AND_STATE(state, acceptMask); + + // Are we in an accept state? 
+ if (ISZERO_STATE(accstate)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accstate); + + return ISNONZERO_STATE(accstate); +} + #undef TESTEOD_FN #undef TESTEOD_REV_FN #undef REPORTCURRENT_FN #undef EXPIRE_ESTATE_FN #undef LIMEX_INACCEPT_FN +#undef LIMEX_INANYACCEPT_FN #undef INITIAL_FN #undef TOP_FN #undef TOPN_FN diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 5d51feb9..77754e0b 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -167,12 +167,10 @@ struct build_info { limex_accel_info accel; }; +#define LAST_LIMEX_NFA LIMEX_NFA_512 + // Constants for scoring mechanism - -#define LAST_LIMEX_NFA LIMEX_NFA_512_7 - -const int LIMEX_INITIAL_SCORE = 2000; -const int SHIFT_COST = 20; // limex: cost per shift mask +const int SHIFT_COST = 10; // limex: cost per shift mask const int EXCEPTION_COST = 4; // limex: per exception template struct NFATraits { }; @@ -261,6 +259,17 @@ void maskSetBits(Mask &m, const NFAStateSet &bits) { } } +template +bool isMaskZero(Mask &m) { + u8 *m8 = (u8 *)&m; + for (u32 i = 0; i < sizeof(m); i++) { + if (m8[i]) { + return false; + } + } + return true; +} + // Sets an entire byte in a mask to the given value template void maskSetByte(Mask &m, const unsigned int idx, const char val) { @@ -336,7 +345,7 @@ void buildReachMapping(const build_info &args, vector &reach, } struct AccelBuild { - AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0), ma_len1(0), + AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0), ma_len1(0), ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {} NFAVertex v; u32 state; @@ -999,7 +1008,8 @@ void findMaskedCompressionStates(const build_info &args, // Suffixes and outfixes can mask out leaf states, which should all be // accepts. Right now we can only do this when there is nothing in initDs, // as we switch that on unconditionally in the expand call. 
- if (generates_callbacks(h) && !hasInitDsStates(h, args.state_ids)) { + if (!inspects_states_for_accepts(h) + && !hasInitDsStates(h, args.state_ids)) { NFAStateSet nonleaf(args.num_states); for (const auto &e : edges_range(h)) { u32 from = args.state_ids.at(source(e, h)); @@ -1162,12 +1172,13 @@ u32 getReportListIndex(const flat_set &reports, } static -void buildExceptionMap(const build_info &args, - const ue2::unordered_set &exceptional, - map > &exceptionMap, - vector &exceptionReports) { +u32 buildExceptionMap(const build_info &args, + const ue2::unordered_set &exceptional, + map > &exceptionMap, + vector &exceptionReports) { const NGHolder &h = args.h; const u32 num_states = args.num_states; + u32 exceptionCount = 0; ue2::unordered_map pos_trigger; ue2::unordered_map tug_trigger; @@ -1297,10 +1308,13 @@ void buildExceptionMap(const build_info &args, assert(e.succ_states.size() == num_states); assert(e.squash_states.size() == num_states); exceptionMap[e].push_back(i); + exceptionCount++; } } - DEBUG_PRINTF("%zu unique exceptions found.\n", exceptionMap.size()); + DEBUG_PRINTF("%u exceptions found (%zu unique)\n", exceptionCount, + exceptionMap.size()); + return exceptionCount; } static @@ -1315,6 +1329,92 @@ u32 depth_to_u32(const depth &d) { return d_val; } +static +bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e, + const build_info &args, u32 maxShift) { + NFAVertex from = source(e, h); + NFAVertex to = target(e, h); + u32 f = args.state_ids.at(from); + u32 t = args.state_ids.at(to); + if (!isLimitedTransition(f, t, maxShift)) { + return true; + } + + // All transitions out of a tug trigger are exceptional. + if (contains(args.tugs, from)) { + return true; + } + return false; +} + +static +u32 findMaxVarShift(const build_info &args, u32 nShifts) { + const NGHolder &h = args.h; + u32 shiftMask = 0; + for (const auto &e : edges_range(h)) { + u32 from = args.state_ids.at(source(e, h)); + u32 to = args.state_ids.at(target(e, h)); + if (from == NO_STATE || to == NO_STATE) { + continue; + } + if (!isExceptionalTransition(h, e, args, MAX_SHIFT_AMOUNT)) { + shiftMask |= (1UL << (to - from)); + } + } + + u32 maxVarShift = 0; + for (u32 shiftCnt = 0; shiftMask != 0 && shiftCnt < nShifts; shiftCnt++) { + maxVarShift = findAndClearLSB_32(&shiftMask); + } + + return maxVarShift; +} + +static +int getLimexScore(const build_info &args, u32 nShifts) { + const NGHolder &h = args.h; + u32 maxVarShift = nShifts; + int score = 0; + + score += SHIFT_COST * nShifts; + maxVarShift = findMaxVarShift(args, nShifts); + + NFAStateSet exceptionalStates(args.num_states); + for (const auto &e : edges_range(h)) { + u32 from = args.state_ids.at(source(e, h)); + u32 to = args.state_ids.at(target(e, h)); + if (from == NO_STATE || to == NO_STATE) { + continue; + } + if (isExceptionalTransition(h, e, args, maxVarShift)) { + exceptionalStates.set(from); + } + } + score += EXCEPTION_COST * exceptionalStates.count(); + return score; +} + +// This function finds the best shift scheme with highest score +// Returns number of shifts and score calculated for appropriate scheme +// Returns zero if no appropriate scheme was found +static +u32 findBestNumOfVarShifts(const build_info &args, + int *bestScoreRet = nullptr) { + u32 bestNumOfVarShifts = 0; + int bestScore = INT_MAX; + for (u32 shiftCount = 1; shiftCount <= MAX_SHIFT_COUNT; shiftCount++) { + int score = getLimexScore(args, shiftCount); + if (score < bestScore) { + bestScore = score; + bestNumOfVarShifts = shiftCount; + } + } + if (bestScoreRet != 
nullptr) { + *bestScoreRet = bestScore; + } + return bestNumOfVarShifts; +} + template struct Factory { // typedefs for readability, for types derived from traits @@ -1322,25 +1422,6 @@ struct Factory { typedef typename NFATraits::implNFA_t implNFA_t; typedef typename NFATraits::tableRow_t tableRow_t; - static - bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e, - const ue2::unordered_map &state_ids, - const ue2::unordered_set &tugs) { - NFAVertex from = source(e, h); - NFAVertex to = target(e, h); - u32 f = state_ids.at(from); - u32 t = state_ids.at(to); - if (!isLimitedTransition(f, t, NFATraits::maxShift)) { - return true; - } - - // All transitions out of a tug trigger are exceptional. - if (contains(tugs, from)) { - return true; - } - return false; - } - static void allocState(NFA *nfa, u32 repeatscratchStateSize, u32 repeatStreamState) { @@ -1504,6 +1585,9 @@ struct Factory { static void writeShiftMasks(const build_info &args, implNFA_t *limex) { const NGHolder &h = args.h; + u32 maxShift = findMaxVarShift(args, limex->shiftCount); + u32 shiftMask = 0; + int shiftMaskIdx = 0; for (const auto &e : edges_range(h)) { u32 from = args.state_ids.at(source(e, h)); @@ -1515,15 +1599,32 @@ struct Factory { // We check for exceptional transitions here, as we don't want tug // trigger transitions emitted as limited transitions (even if they // could be in this model). - if (!isExceptionalTransition(h, e, args.state_ids, args.tugs)) { - maskSetBit(limex->shift[to - from], from); + if (!isExceptionalTransition(h, e, args, maxShift)) { + u32 shift = to - from; + if ((shiftMask & (1UL << shift)) == 0UL) { + shiftMask |= (1UL << shift); + limex->shiftAmount[shiftMaskIdx++] = (u8)shift; + } + assert(limex->shiftCount <= MAX_SHIFT_COUNT); + for (u32 i = 0; i < limex->shiftCount; i++) { + if (limex->shiftAmount[i] == (u8)shift) { + maskSetBit(limex->shift[i], from); + break; + } + } + } + } + if (maxShift && limex->shiftCount > 1) { + for (u32 i = 0; i < limex->shiftCount; i++) { + assert(!isMaskZero(limex->shift[i])); } } } static void findExceptionalTransitions(const build_info &args, - ue2::unordered_set &exceptional) { + ue2::unordered_set &exceptional, + u32 maxShift) { const NGHolder &h = args.h; for (const auto &e : edges_range(h)) { @@ -1533,7 +1634,7 @@ struct Factory { continue; } - if (isExceptionalTransition(h, e, args.state_ids, args.tugs)) { + if (isExceptionalTransition(h, e, args, maxShift)) { exceptional.insert(e); } } @@ -1545,19 +1646,25 @@ struct Factory { implNFA_t *limex, const u32 exceptionsOffset) { DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset); - // to make testing easier, we pre-set the exceptionMap to all invalid - // values - memset(limex->exceptionMap, 0xff, sizeof(limex->exceptionMap)); - exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset); assert(ISALIGNED(etable)); - u32 ecount = 0; + map exception_by_state; for (const auto &m : exceptionMap) { const ExceptionProto &proto = m.first; const vector &states = m.second; - DEBUG_PRINTF("exception %u, triggered by %zu states.\n", ecount, - states.size()); + for (u32 i : states) { + assert(!contains(exception_by_state, i)); + exception_by_state.emplace(i, proto); + } + } + + u32 ecount = 0; + for (const auto &m : exception_by_state) { + const ExceptionProto &proto = m.second; + u32 state_id = m.first; + DEBUG_PRINTF("exception %u, triggered by state %u\n", ecount, + state_id); // Write the exception entry. 
exception_t &e = etable[ecount]; @@ -1571,13 +1678,10 @@ struct Factory { : repeatOffsets[proto.repeat_index]; e.repeatOffset = repeat_offset; - // for each state that can switch it on - for (auto state_id : states) { - // set this bit in the exception mask - maskSetBit(limex->exceptionMask, state_id); - // set this index in the exception map - limex->exceptionMap[state_id] = ecount; - } + // for the state that can switch it on + // set this bit in the exception mask + maskSetBit(limex->exceptionMask, state_id); + ecount++; } @@ -1778,16 +1882,17 @@ struct Factory { } ue2::unordered_set exceptional; - findExceptionalTransitions(args, exceptional); + u32 shiftCount = findBestNumOfVarShifts(args); + assert(shiftCount); + u32 maxShift = findMaxVarShift(args, shiftCount); + findExceptionalTransitions(args, exceptional, maxShift); map > exceptionMap; vector exceptionReports; - buildExceptionMap(args, exceptional, exceptionMap, exceptionReports); + u32 exceptionCount = buildExceptionMap(args, exceptional, exceptionMap, + exceptionReports); - if (exceptionMap.size() > ~0U) { - DEBUG_PRINTF("too many exceptions!\n"); - return nullptr; - } + assert(exceptionCount <= args.num_states); // Build reach table and character mapping. vector reach; @@ -1842,7 +1947,7 @@ struct Factory { offset = ROUNDUP_CL(offset); const u32 exceptionsOffset = offset; - offset += sizeof(exception_t) * exceptionMap.size(); + offset += sizeof(exception_t) * exceptionCount; const u32 exceptionReportsOffset = offset; offset += sizeof(ReportID) * exceptionReports.size(); @@ -1874,6 +1979,7 @@ struct Factory { writeAccepts(acceptMask, acceptEodMask, accepts, acceptsEod, squash, limex, acceptsOffset, acceptsEodOffset, squashOffset); + limex->shiftCount = shiftCount; writeShiftMasks(args, limex); // Determine the state required for our state vector. @@ -1907,8 +2013,6 @@ struct Factory { } static int score(const build_info &args) { - const NGHolder &h = args.h; - // LimEx NFAs are available in sizes from 32 to 512-bit. size_t num_states = args.num_states; @@ -1928,45 +2032,17 @@ struct Factory { sz = args.cc.grey.nfaForceSize; } - if (args.cc.grey.nfaForceShifts && - NFATraits::maxShift != args.cc.grey.nfaForceShifts) { - return -1; - } - if (sz != NFATraits::maxStates) { return -1; // fail, size not appropriate } // We are of the right size, calculate a score based on the number // of exceptions and the number of shifts used by this LimEx. 
- int score = LIMEX_INITIAL_SCORE; - if (NFATraits::maxShift != 0) { - score -= SHIFT_COST / 2; // first shift mask is cheap - score -= SHIFT_COST * (NFATraits::maxShift - 1); + int score; + u32 shiftCount = findBestNumOfVarShifts(args, &score); + if (shiftCount == 0) { + return -1; } - - NFAStateSet exceptionalStates(num_states); // outbound exc trans - - for (const auto &e : edges_range(h)) { - u32 from = args.state_ids.at(source(e, h)); - u32 to = args.state_ids.at(target(e, h)); - if (from == NO_STATE || to == NO_STATE) { - continue; - } - - if (isExceptionalTransition(h, e, args.state_ids, args.tugs)) { - exceptionalStates.set(from); - } - } - DEBUG_PRINTF("%zu exceptional states\n", exceptionalStates.count()); - score -= EXCEPTION_COST * exceptionalStates.count(); - - /* ensure that we always report a valid score if have the right number - * of states */ - if (score < 0) { - score = 0; - } - return score; } }; @@ -1985,50 +2061,19 @@ struct scoreNfa { } }; -#define MAKE_LIMEX_TRAITS(mlt_size, mlt_shift) \ - template<> struct NFATraits { \ - typedef LimExNFA##mlt_size implNFA_t; \ - typedef u_##mlt_size tableRow_t; \ - typedef NFAException##mlt_size exception_t; \ - static const size_t maxStates = mlt_size; \ - static const u32 maxShift = mlt_shift; \ - }; \ +#define MAKE_LIMEX_TRAITS(mlt_size) \ + template<> struct NFATraits { \ + typedef LimExNFA##mlt_size implNFA_t; \ + typedef u_##mlt_size tableRow_t; \ + typedef NFAException##mlt_size exception_t; \ + static const size_t maxStates = mlt_size; \ + }; -MAKE_LIMEX_TRAITS(32, 1) -MAKE_LIMEX_TRAITS(32, 2) -MAKE_LIMEX_TRAITS(32, 3) -MAKE_LIMEX_TRAITS(32, 4) -MAKE_LIMEX_TRAITS(32, 5) -MAKE_LIMEX_TRAITS(32, 6) -MAKE_LIMEX_TRAITS(32, 7) -MAKE_LIMEX_TRAITS(128, 1) -MAKE_LIMEX_TRAITS(128, 2) -MAKE_LIMEX_TRAITS(128, 3) -MAKE_LIMEX_TRAITS(128, 4) -MAKE_LIMEX_TRAITS(128, 5) -MAKE_LIMEX_TRAITS(128, 6) -MAKE_LIMEX_TRAITS(128, 7) -MAKE_LIMEX_TRAITS(256, 1) -MAKE_LIMEX_TRAITS(256, 2) -MAKE_LIMEX_TRAITS(256, 3) -MAKE_LIMEX_TRAITS(256, 4) -MAKE_LIMEX_TRAITS(256, 5) -MAKE_LIMEX_TRAITS(256, 6) -MAKE_LIMEX_TRAITS(256, 7) -MAKE_LIMEX_TRAITS(384, 1) -MAKE_LIMEX_TRAITS(384, 2) -MAKE_LIMEX_TRAITS(384, 3) -MAKE_LIMEX_TRAITS(384, 4) -MAKE_LIMEX_TRAITS(384, 5) -MAKE_LIMEX_TRAITS(384, 6) -MAKE_LIMEX_TRAITS(384, 7) -MAKE_LIMEX_TRAITS(512, 1) -MAKE_LIMEX_TRAITS(512, 2) -MAKE_LIMEX_TRAITS(512, 3) -MAKE_LIMEX_TRAITS(512, 4) -MAKE_LIMEX_TRAITS(512, 5) -MAKE_LIMEX_TRAITS(512, 6) -MAKE_LIMEX_TRAITS(512, 7) +MAKE_LIMEX_TRAITS(32) +MAKE_LIMEX_TRAITS(128) +MAKE_LIMEX_TRAITS(256) +MAKE_LIMEX_TRAITS(384) +MAKE_LIMEX_TRAITS(512) } // namespace @@ -2133,20 +2178,18 @@ aligned_unique_ptr generate(NGHolder &h, // Acceleration analysis. fillAccelInfo(arg); - typedef pair EngineScore; - vector scores; + vector> scores; if (hint != INVALID_NFA) { // The caller has told us what to (attempt to) build. - scores.push_back(make_pair(0, (NFAEngineType)hint)); + scores.emplace_back(0, (NFAEngineType)hint); } else { for (size_t i = 0; i <= LAST_LIMEX_NFA; i++) { NFAEngineType ntype = (NFAEngineType)i; - int score = DISPATCH_BY_LIMEX_TYPE(ntype, scoreNfa, arg); if (score >= 0) { DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score); - scores.push_back(make_pair(score, ntype)); + scores.emplace_back(score, ntype); } } } @@ -2156,22 +2199,22 @@ aligned_unique_ptr generate(NGHolder &h, return nullptr; } - sort(scores.begin(), scores.end(), greater()); + // Sort acceptable models in priority order, lowest score first. 
+ sort(scores.begin(), scores.end()); - aligned_unique_ptr nfa; - for (auto i = scores.begin(); !nfa && i != scores.end(); ++i) { - assert(i->first >= 0); - nfa = DISPATCH_BY_LIMEX_TYPE(i->second, generateNfa, arg); + for (const auto &elem : scores) { + assert(elem.first >= 0); + NFAEngineType limex_model = elem.second; + auto nfa = DISPATCH_BY_LIMEX_TYPE(limex_model, generateNfa, arg); + if (nfa) { + DEBUG_PRINTF("successful build with NFA engine: %s\n", + nfa_type_name(limex_model)); + return nfa; + } } - if (!nfa) { - DEBUG_PRINTF("NFA build failed.\n"); - return nullptr; - } - - DEBUG_PRINTF("successful build with NFA engine: %s\n", - nfa_type_name((NFAEngineType)nfa->type)); - return nfa; + DEBUG_PRINTF("NFA build failed.\n"); + return nullptr; } u32 countAccelStates(NGHolder &h, diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 084f35dd..c52adc46 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,6 +80,23 @@ void dumpMask(FILE *f, const char *name, const u8 *mask, u32 mask_bits) { fprintf(f, "MSK %-20s %s\n", name, dumpMask(mask, mask_bits).c_str()); } +template +static +u32 rank_in_mask(mask_t mask, u32 bit) { + assert(bit < 8 * sizeof(mask)); + + u32 chunks[sizeof(mask)/sizeof(u32)]; + memcpy(chunks, &mask, sizeof(mask)); + u32 base_rank = 0; + for (u32 i = 0; i < bit / 32; i++) { + base_rank += popcount32(chunks[i]); + } + u32 chunk = chunks[bit / 32]; + u32 local_bit = bit % 32; + assert(chunk & (1U << local_bit)); + return base_rank + popcount32(chunk & ((1U << local_bit) - 1)); +} + template static void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) { @@ -244,6 +261,16 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) { } } +template +static +void dumpLimexShifts(const limex_type *limex, FILE *f) { + u32 size = limex_traits::size; + fprintf(f, "Shift Masks:\n"); + for(u32 i = 0; i < limex->shiftCount; i++) { + fprintf(f, "\t Shift %u(%hhu)\t\tMask: %s\n", i, limex->shiftAmount[i], + dumpMask((const u8 *)&limex->shift[i], size).c_str()); + } +} template static void dumpLimexText(const limex_type *limex, FILE *f) { @@ -270,6 +297,9 @@ void dumpLimexText(const limex_type *limex, FILE *f) { topMask += size / 8; } + // Dump shift masks + dumpLimexShifts(limex, f); + dumpSquash(limex, f); dumpLimexReachMap(limex->reachMap, f); @@ -325,7 +355,7 @@ struct limex_labeller : public nfa_labeller { return; } - u32 ex_index = limex->exceptionMap[state]; + u32 ex_index = rank_in_mask(limex->exceptionMask, state); const typename limex_traits::exception_type *e = &exceptions[ex_index]; @@ -396,7 +426,7 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) { const typename limex_traits::exception_type *exceptions = getExceptionTable(limex); - u32 ex_index = limex->exceptionMap[state]; + u32 ex_index = rank_in_mask(limex->exceptionMask, state); const typename limex_traits::exception_type *e = &exceptions[ex_index]; @@ -420,78 +450,45 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) { template static void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) { - for (u32 j = 0; j < MAX_MAX_SHIFT; j++) { + for (u32 j = 0; j < limex->shiftCount; j++) { + const u32 shift_amount = limex->shiftAmount[j]; if (testbit((const u8 *)&limex->shift[j], 
limex_traits::size, state)) { - fprintf(f, "%u -> %u;\n", state, state + j); + fprintf(f, "%u -> %u;\n", state, state + shift_amount); } } } -#define DUMP_TEXT_FN(ddf_u, ddf_n, ddf_s) \ - void nfaExecLimEx##ddf_n##_##ddf_s##_dumpText(const NFA *nfa, FILE *f) { \ +#define DUMP_TEXT_FN(ddf_n) \ + void nfaExecLimEx##ddf_n##_dumpText(const NFA *nfa, FILE *f) { \ dumpLimexText((const LimExNFA##ddf_n *)getImplNfa(nfa), f); \ } -#define DUMP_DOT_FN(ddf_u, ddf_n, ddf_s) \ - void nfaExecLimEx##ddf_n##_##ddf_s##_dumpDot(const NFA *nfa, FILE *f) { \ +#define DUMP_DOT_FN(ddf_n) \ + void nfaExecLimEx##ddf_n##_dumpDot(const NFA *nfa, FILE *f, \ + UNUSED const string &base) { \ const LimExNFA##ddf_n *limex = \ (const LimExNFA##ddf_n *)getImplNfa(nfa); \ \ dumpDotPreamble(f); \ - u32 state_count = nfa->nPositions; \ + u32 state_count = nfa->nPositions; \ dumpVertexDotInfo(limex, state_count, f, \ limex_labeller(limex)); \ for (u32 i = 0; i < state_count; i++) { \ dumpLimDotInfo(limex, i, f); \ dumpExDotInfo(limex, i, f); \ } \ - \ dumpDotTrailer(f); \ } -#define LIMEX_DUMP_FNS(ntype, size, shifts) \ - DUMP_TEXT_FN(ntype, size, shifts) \ - DUMP_DOT_FN(ntype, size, shifts) +#define LIMEX_DUMP_FNS(size) \ + DUMP_TEXT_FN(size) \ + DUMP_DOT_FN(size) -LIMEX_DUMP_FNS(u32, 32, 1) -LIMEX_DUMP_FNS(u32, 32, 2) -LIMEX_DUMP_FNS(u32, 32, 3) -LIMEX_DUMP_FNS(u32, 32, 4) -LIMEX_DUMP_FNS(u32, 32, 5) -LIMEX_DUMP_FNS(u32, 32, 6) -LIMEX_DUMP_FNS(u32, 32, 7) - -LIMEX_DUMP_FNS(m128, 128, 1) -LIMEX_DUMP_FNS(m128, 128, 2) -LIMEX_DUMP_FNS(m128, 128, 3) -LIMEX_DUMP_FNS(m128, 128, 4) -LIMEX_DUMP_FNS(m128, 128, 5) -LIMEX_DUMP_FNS(m128, 128, 6) -LIMEX_DUMP_FNS(m128, 128, 7) - -LIMEX_DUMP_FNS(m256, 256, 1) -LIMEX_DUMP_FNS(m256, 256, 2) -LIMEX_DUMP_FNS(m256, 256, 3) -LIMEX_DUMP_FNS(m256, 256, 4) -LIMEX_DUMP_FNS(m256, 256, 5) -LIMEX_DUMP_FNS(m256, 256, 6) -LIMEX_DUMP_FNS(m256, 256, 7) - -LIMEX_DUMP_FNS(m384, 384, 1) -LIMEX_DUMP_FNS(m384, 384, 2) -LIMEX_DUMP_FNS(m384, 384, 3) -LIMEX_DUMP_FNS(m384, 384, 4) -LIMEX_DUMP_FNS(m384, 384, 5) -LIMEX_DUMP_FNS(m384, 384, 6) -LIMEX_DUMP_FNS(m384, 384, 7) - -LIMEX_DUMP_FNS(m512, 512, 1) -LIMEX_DUMP_FNS(m512, 512, 2) -LIMEX_DUMP_FNS(m512, 512, 3) -LIMEX_DUMP_FNS(m512, 512, 4) -LIMEX_DUMP_FNS(m512, 512, 5) -LIMEX_DUMP_FNS(m512, 512, 6) -LIMEX_DUMP_FNS(m512, 512, 7) +LIMEX_DUMP_FNS(32) +LIMEX_DUMP_FNS(128) +LIMEX_DUMP_FNS(256) +LIMEX_DUMP_FNS(384) +LIMEX_DUMP_FNS(512) } // namespace ue2 diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 26c5e5a5..175ca393 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -79,9 +79,13 @@ #ifdef ARCH_64_BIT #define CHUNK_T u64a #define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 #else #define CHUNK_T u32 #define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 #endif /** \brief Process a single exception. Returns 1 if exception handling should @@ -206,13 +210,13 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, #ifndef RUN_EXCEPTION_FN_ONLY -/** \brief Process all of the exceptions associated with the states in the \a estate. */ +/** \brief Process all of the exceptions associated with the states in the \a + * estate. 
*/ static really_inline int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, - const struct IMPL_NFA_T *limex, - const u32 *exceptionMap, const EXCEPTION_T *exceptions, - const ReportID *exReports, - u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { + const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, + const ReportID *exReports, u64a offset, struct CONTEXT_T *ctx, + char in_rev, char flags) { assert(diffmask > 0); // guaranteed by caller macro if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) { @@ -237,15 +241,23 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, // A copy of the estate as an array of GPR-sized chunks. CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; #ifdef ESTATE_ON_STACK memcpy(chunks, &estate, sizeof(STATE_T)); #else memcpy(chunks, estatep, sizeof(STATE_T)); #endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); struct proto_cache new_cache = {0, NULL}; enum CacheResult cacheable = CACHE_RESULT; + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (u32 i = 0; i < ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + do { u32 t = findAndClearLSB_32(&diffmask); #ifdef ARCH_64_BIT @@ -254,10 +266,10 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, assert(t < ARRAY_LENGTH(chunks)); CHUNK_T word = chunks[t]; assert(word != 0); - u32 base = t * sizeof(CHUNK_T) * 8; do { - u32 bit = FIND_AND_CLEAR_FN(&word) + base; - u32 idx = exceptionMap[bit]; + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; const EXCEPTION_T *e = &exceptions[idx]; if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index adae6ab7..c37f5f40 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -68,6 +68,9 @@ The value of NFA.stateSize gives the total state size in bytes (the sum of all the above). + Number of shifts should be always greater or equal to 1 + Number of shifts 0 means that no appropriate NFA engine was found. 
+ */ #ifndef LIMEX_INTERNAL_H @@ -77,7 +80,8 @@ #include "repeat_internal.h" // Constants -#define MAX_MAX_SHIFT 8 /**< largest maxshift used by a LimEx NFA */ +#define MAX_SHIFT_COUNT 8 /**< largest number of shifts used by a LimEx NFA */ +#define MAX_SHIFT_AMOUNT 16 /**< largest shift amount used by a LimEx NFA */ #define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */ #define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */ @@ -95,24 +99,6 @@ enum LimExSquash { LIMEX_SQUASH_REPORT = 3 //!< squash when report is raised }; -struct LimExNFABase { - u8 reachMap[N_CHARS]; - u32 reachSize; - u32 accelCount; - u32 accelTableOffset; - u32 accelAuxCount; - u32 accelAuxOffset; - u32 acceptCount; - u32 acceptOffset; - u32 acceptEodCount; - u32 acceptEodOffset; - u32 exceptionCount; - u32 exceptionOffset; - u32 exReportOffset; - u32 repeatCount; - u32 repeatOffset; -}; - /* uniform looking types for the macros */ typedef u8 u_8; typedef u16 u_16; @@ -133,7 +119,7 @@ struct NFAException##size { \ u8 trigger; /**< from enum LimExTrigger */ \ }; \ \ -struct LimExNFA##size { /* MUST align with LimExNFABase */ \ +struct LimExNFA##size { \ u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ u32 reachSize; /**< number of reach masks */ \ u32 accelCount; /**< number of entries in accel table */ \ @@ -149,7 +135,6 @@ struct LimExNFA##size { /* MUST align with LimExNFABase */ \ u32 exReportOffset; /* rel. to start of LimExNFA */ \ u32 repeatCount; \ u32 repeatOffset; \ - u32 exceptionMap[size]; \ u32 squashOffset; /* rel. to start of LimExNFA; for accept squashing */ \ u32 squashCount; \ u32 topCount; \ @@ -168,8 +153,10 @@ struct LimExNFA##size { /* MUST align with LimExNFABase */ \ u_##size compressMask; /**< switch off before compress */ \ u_##size exceptionMask; \ u_##size repeatCyclicMask; \ - u_##size shift[MAX_MAX_SHIFT]; \ u_##size zombieMask; /**< zombie if in any of the set states */ \ + u_##size shift[MAX_SHIFT_COUNT]; \ + u32 shiftCount; /**< number of shift masks used */ \ + u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ }; CREATE_NFA_LIMEX(32) diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index 471e4bf0..8a0a8acd 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -74,7 +74,6 @@ static really_inline int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, const struct LimExNFA32 *limex, - const u32 *exceptionMap, const struct NFAException32 *exceptions, const ReportID *exReports, u64a offset, struct NFAContext32 *ctx, char in_rev, char flags) { @@ -104,7 +103,7 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, do { u32 bit = findAndClearLSB_32(&estate); - u32 idx = exceptionMap[bit]; + u32 idx = rank_in_mask32(limex->exceptionMask, bit); const struct NFAException32 *e = &exceptions[idx]; if (!runException32(e, s, succ, &local_succ, limex, exReports, offset, ctx, &new_cache, &cacheable, in_rev, flags)) { @@ -132,35 +131,4 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, #define SIZE 32 #define STATE_T u32 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 32 
-#define STATE_T u32 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 7 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h index 4e111aa6..e0c182fc 100644 --- a/src/nfa/limex_runtime.h +++ b/src/nfa/limex_runtime.h @@ -73,34 +73,35 @@ struct proto_cache { }; // Shift macros for Limited NFAs. Defined in terms of uniform ops. +// LimExNFAxxx ptr in 'limex' and the current state in 's' #define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \ - (JOIN(shift_, nels_type)( \ + (JOIN(lshift_, nels_type)( \ JOIN(and_, nels_type)(s, \ JOIN(load_, nels_type)(&limex->shift[nels_i])), \ - nels_i)) + limex->shiftAmount[nels_i])) -// Calculate the (limited model) successors for a given max shift. Assumes -// LimExNFAxxx ptr in 'l', current state in 's' and successors in 'succ'. +// Calculate the (limited model) successors for a number of variable shifts. +// Assumes current state in 's' and successors in 'succ'. -#define NFA_EXEC_GET_LIM_SUCC(gls_type, gls_shift) \ +#define NFA_EXEC_GET_LIM_SUCC(gls_type) \ do { \ - succ = \ - JOIN(and_, gls_type)(s, JOIN(load_, gls_type)(&limex->shift[0])); \ - switch (gls_shift) { \ - case 7: \ + succ = NFA_EXEC_LIM_SHIFT(gls_type, 0); \ + switch (limex->shiftCount) { \ + case 8: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 7)); \ - case 6: \ + case 7: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 6)); \ - case 5: \ + case 6: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 5)); \ - case 4: \ + case 5: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 4)); \ - case 3: \ + case 4: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 3)); \ - case 2: \ + case 3: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 2)); \ - case 1: \ + case 2: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 1)); \ + case 1: \ case 0: \ ; \ } \ @@ -129,7 +130,7 @@ int limexRunReports(const ReportID *reports, NfaCallback callback, for (; *reports != MO_INVALID_IDX; ++reports) { DEBUG_PRINTF("firing report for id %u at offset %llu\n", *reports, offset); - int rv = callback(offset, *reports, context); + int rv = callback(0, offset, *reports, context); if (rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 676ed370..881e41fd 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -37,11 +37,11 @@ * Version 2.0: now with X-Macros, so you get line numbers in your debugger. */ -#if !defined(SIZE) || !defined(STATE_T) || !defined(SHIFT) -# error Must define SIZE and STATE_T and SHIFT in includer. +#if !defined(SIZE) || !defined(STATE_T) +# error Must define SIZE and STATE_T in includer. 
#endif -#define LIMEX_API_ROOT JOIN(JOIN(JOIN(nfaExecLimEx, SIZE), _), SHIFT) +#define LIMEX_API_ROOT JOIN(nfaExecLimEx, SIZE) #define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) @@ -73,6 +73,7 @@ #define ANDNOT_STATE JOIN(andnot_, STATE_T) #define OR_STATE JOIN(or_, STATE_T) #define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define CLEARBIT_STATE JOIN(clearbit_, STATE_T) #define ZERO_STATE JOIN(zero_, STATE_T) #define ISNONZERO_STATE JOIN(isNonZero_, STATE_T) #define ISZERO_STATE JOIN(isZero_, STATE_T) @@ -104,8 +105,8 @@ // continue, 1 if an accept was fired and the user instructed us to halt. static really_inline char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, - const ReportID *exReports, const u32 *exceptionMap, - STATE_T s, const STATE_T emask, size_t i, u64a offset, + const ReportID *exReports, STATE_T s, + const STATE_T emask, size_t i, u64a offset, STATE_T *succ, u64a *final_loc, struct CONTEXT_T *ctx, const char flags, const char in_rev, const char first_match) { @@ -132,8 +133,8 @@ char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, char localflags = (!i && !in_rev) ? NO_OUTPUT | FIRST_BYTE : flags; int rv = JOIN(processExceptional, SIZE)( - pass_state, pass_estate, diffmask, succ, limex, exceptionMap, - exceptions, exReports, callback_offset, ctx, in_rev, localflags); + pass_state, pass_estate, diffmask, succ, limex, exceptions, exReports, + callback_offset, ctx, in_rev, localflags); if (rv == PE_RV_HALT) { return 1; // Halt matching. } @@ -175,7 +176,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const ReportID *exReports = getExReports(limex); - const u32 *exceptionMap = limex->exceptionMap; STATE_T s = LOAD_STATE(&ctx->s); /* assert(ISALIGNED_16(exceptions)); */ @@ -201,11 +201,11 @@ without_accel: u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, - EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, - flags, 0, first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, + i, offset, &succ, final_loc, ctx, flags, 0, + first_match)) { return MO_HALT_MATCHING; } @@ -252,11 +252,11 @@ with_accel: u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, - EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, - flags, 0, first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, + i, offset, &succ, final_loc, ctx, flags, 0, + first_match)) { return MO_HALT_MATCHING; } @@ -300,7 +300,6 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, #endif const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const ReportID *exReports = getExReports(limex); - const u32 *exceptionMap = limex->exceptionMap; STATE_T s = LOAD_STATE(&ctx->s); /* assert(ISALIGNED_16(exceptions)); */ @@ -318,9 +317,9 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, u8 c = input[i-1]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, i, offset, &succ, 
final_loc, ctx, flags, 1, 0)) { return MO_HALT_MATCHING; @@ -349,36 +348,57 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, } static really_inline -void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, +void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, u64a offset) { if (!limex->repeatCount) { return; } - // Note: we compress all repeats, as they may have *just* had their - // cyclic states switched off a moment ago. TODO: is this required + STATE_T s = LOAD_STATE(src); + + if (ISZERO_STATE(AND_STATE(s, LOAD_STATE(&limex->repeatCyclicMask)))) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } const union RepeatControl *ctrl = getRepeatControlBaseConst((const char *)src, sizeof(STATE_T)); char *state_base = (char *)dest + limex->stateSize; for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + if (!TESTBIT_STATE(&s, info->cyclicState)) { + DEBUG_PRINTF("is dead\n"); + continue; + } + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeatHasMatch(repeat, &ctrl[i], state_base + info->stateOffset, + offset) == REPEAT_STALE) { + DEBUG_PRINTF("is stale, clearing state\n"); + CLEARBIT_STATE(&s, info->cyclicState); + continue; + } + + DEBUG_PRINTF("packing state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); repeatPack(state_base + info->packedCtrlOffset, repeat, &ctrl[i], offset); } + + STORE_STATE(src, s); } char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct NFA *n, - const struct mq *q, - s64a loc) { + const struct mq *q, s64a loc) { void *dest = q->streamState; - const void *src = q->state; + void *src = q->state; u8 key = queue_prev_byte(q, loc); const IMPL_NFA_T *limex = getImplNfa(n); - COMPRESS_FN(limex, dest, src, key); COMPRESS_REPEATS_FN(limex, dest, src, q->offset + loc); + COMPRESS_FN(limex, dest, src, key); return 0; } @@ -389,15 +409,29 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, return; } - // Note: we expand all repeats, as they may have *just* had their - // cyclic states switched off a moment ago. TODO: is this required? + // Note: state has already been expanded into 'dest'. + const STATE_T cyclics = + AND_STATE(LOAD_STATE(dest), LOAD_STATE(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } union RepeatControl *ctrl = getRepeatControlBase((char *)dest, sizeof(STATE_T)); const char *state_base = (const char *)src + limex->stateSize; for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + if (!TESTBIT_STATE(&cyclics, info->cyclicState)) { + DEBUG_PRINTF("is dead\n"); + continue; + } + + DEBUG_PRINTF("unpacking state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); const struct RepeatInfo *repeat = getRepeatInfo(info); repeatUnpack(state_base + info->packedCtrlOffset, repeat, offset, &ctrl[i]); @@ -650,7 +684,27 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { ep = MIN(ep, end_abs); assert(ep >= sp); - assert(sp >= offset); // We no longer do history buffer scans here. 
+ if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + u64a final_look = 0; + /* we are starting inside the history buffer */ + if (STREAMFIRST_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu " + "offset:%llu\n", final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + return MO_MATCHES_PENDING; + } + + sp = local_ep; + } if (sp >= ep) { goto scan_done; @@ -789,10 +843,8 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, } char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, - const char *streamState, u64a offset, - NfaCallback callback, - UNUSED SomNfaCallback som_callback, - void *context) { + const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(n && state); const IMPL_NFA_T *limex = getImplNfa(n); @@ -868,6 +920,21 @@ char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa, offset, report); } +char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = LOAD_STATE(q->state); + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAnyAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset); +} + enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( const struct NFA *nfa, struct mq *q, @@ -920,6 +987,7 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( #undef ANDNOT_STATE #undef OR_STATE #undef TESTBIT_STATE +#undef CLEARBIT_STATE #undef ZERO_STATE #undef ISNONZERO_STATE #undef ISZERO_STATE @@ -935,5 +1003,4 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( // Parameters. #undef SIZE #undef STATE_T -#undef SHIFT #undef LIMEX_API_ROOT diff --git a/src/util/shuffle.h b/src/nfa/limex_shuffle.h similarity index 74% rename from src/util/shuffle.h rename to src/nfa/limex_shuffle.h index ba85fb5d..e45e4331 100644 --- a/src/util/shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,20 +34,19 @@ * be faster and actually correct if these assumptions don't hold true. */ -#ifndef SHUFFLE_H -#define SHUFFLE_H +#ifndef LIMEX_SHUFFLE_H +#define LIMEX_SHUFFLE_H -#include "config.h" -#include "bitutils.h" -#include "simd_utils.h" #include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" #if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) #define HAVE_PEXT #endif static really_inline -u32 shuffleDynamic32(u32 x, u32 mask) { +u32 packedExtract32(u32 x, u32 mask) { #if defined(HAVE_PEXT) // Intel BMI2 can do this operation in one instruction. 
return _pext_u32(x, mask); @@ -67,7 +66,7 @@ u32 shuffleDynamic32(u32 x, u32 mask) { } static really_inline -u32 shuffleDynamic64(u64a x, u64a mask) { +u32 packedExtract64(u64a x, u64a mask) { #if defined(HAVE_PEXT) && defined(ARCH_64_BIT) // Intel BMI2 can do this operation in one instruction. return _pext_u64(x, mask); @@ -88,4 +87,24 @@ u32 shuffleDynamic64(u64a x, u64a mask) { #undef HAVE_PEXT -#endif // SHUFFLE_H +static really_inline +u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { + m128 shuffled = pshufb(s, permute); + m128 compared = and128(shuffled, compare); + u16 rv = ~movemask128(eq128(compared, shuffled)); + return (u32)rv; +} + +#if defined(__AVX2__) +static really_inline +u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m256 shuffled = vpshufb(s, permute); + m256 compared = and256(shuffled, compare); + u32 rv = ~movemask256(eq256(compared, shuffled)); + // stitch the lane-wise results back together + return (u32)((rv >> 16) | (rv & 0xffffU)); +} +#endif // AVX2 + +#endif // LIMEX_SHUFFLE_H diff --git a/src/nfa/limex_simd128.c b/src/nfa/limex_simd128.c index 781c7972..f0fb1dd4 100644 --- a/src/nfa/limex_simd128.c +++ b/src/nfa/limex_simd128.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,37 +61,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 7 +#define SIZE 128 +#define STATE_T m128 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd256.c b/src/nfa/limex_simd256.c index b4df1459..57648b69 100644 --- a/src/nfa/limex_simd256.c +++ b/src/nfa/limex_simd256.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,37 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 7 +#define SIZE 256 +#define STATE_T m256 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd384.c b/src/nfa/limex_simd384.c index 
4b4b44bb..84061f61 100644 --- a/src/nfa/limex_simd384.c +++ b/src/nfa/limex_simd384.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,37 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 7 +#define SIZE 384 +#define STATE_T m384 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd512b.c b/src/nfa/limex_simd512.c similarity index 88% rename from src/nfa/limex_simd512b.c rename to src/nfa/limex_simd512.c index a3b705df..a6646d83 100644 --- a/src/nfa/limex_simd512b.c +++ b/src/nfa/limex_simd512.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,12 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 5 +#define SIZE 512 +#define STATE_T m512 #include "limex_runtime_impl.h" diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 314e88e7..88da27c0 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -42,13 +42,13 @@ static really_inline char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, - u16 s, u64a loc, char eod, u16 * const cached_accept_state, - u32 * const cached_accept_id) { + u16 s, u64a loc, char eod, u16 *const cached_accept_state, + u32 *const cached_accept_id) { DEBUG_PRINTF("reporting state = %hu, loc=%llu, eod %hhu\n", (u16)(s & STATE_MASK), loc, eod); if (!eod && s == *cached_accept_state) { - if (cb(loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } @@ -71,7 +71,7 @@ char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, *cached_accept_id = rl->report[0]; DEBUG_PRINTF("reporting %u\n", rl->report[0]); - if (cb(loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } @@ -80,7 +80,7 @@ char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, for (u32 i = 0; i < count; i++) { DEBUG_PRINTF("reporting %u\n", rl->report[i]); - if (cb(loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } @@ -146,7 +146,7 @@ without_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, 
m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, @@ -186,7 +186,7 @@ with_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, @@ -328,7 +328,7 @@ without_accel: u64a loc = (c - 1) - buf + offAdj + 1; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } else if (doComplexReport(cb, ctxt, m, s, loc, 0, @@ -360,7 +360,7 @@ with_accel: u64a loc = (c - 1) - buf + offAdj + 1; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } else if (doComplexReport(cb, ctxt, m, s, loc, 0, @@ -475,7 +475,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, int rv; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - rv = cb(q_cur_offset(q), m->arb_report, context); + rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -632,7 +632,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, int rv; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - rv = cb(q_cur_offset(q), m->arb_report, context); + rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -836,7 +836,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { if (s >= m->accept_limit_8) { if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - cb(offset, m->arb_report, ctxt); + cb(0, offset, m->arb_report, ctxt); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -850,7 +850,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { } char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); NfaCallback cb = q->cb; void *ctxt = q->context; u16 s = *(u16 *)q->state; @@ -864,7 +864,7 @@ char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { if (aux->accept) { if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - cb(offset, m->arb_report, ctxt); + cb(0, offset, m->arb_report, ctxt); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -905,7 +905,7 @@ char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, struct mq *q) { assert(n && q); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); u8 s = *(u8 *)q->state; DEBUG_PRINTF("checking accepts for %hhu\n", s); if (s < m->accept_limit_8) { @@ -915,25 +915,45 @@ char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, return mcclellanHasAccept(m, get_aux(m, s), report); } +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + assert(s < 
m->accept_limit_8 || get_aux(m, s)->accept); + + return s >= m->accept_limit_8; +} char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, struct mq *q) { assert(n && q); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); u16 s = *(u16 *)q->state; DEBUG_PRINTF("checking accepts for %hu\n", s); return mcclellanHasAccept(m, get_aux(m, s), report); } +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux(m, s)->accept; +} + char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_8); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, @@ -947,7 +967,7 @@ char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_16); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, @@ -961,7 +981,7 @@ char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_8); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; char rv = nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, @@ -980,7 +1000,7 @@ char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_16); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; char rv = nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, @@ -996,7 +1016,7 @@ char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, void *state, UNUSED u8 key) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u8 s = offset ? m->start_floating : m->start_anchored; if (s) { *(u8 *)state = s; @@ -1007,7 +1027,7 @@ char nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, void *state, UNUSED u8 key) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u16 s = offset ? 
m->start_floating : m->start_anchored; if (s) { unaligned_store_u16(state, s); @@ -1019,7 +1039,7 @@ char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, const u8 *buf, char top, size_t start_off, size_t len, NfaCallback cb, void *ctxt) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u8 s = top ? m->start_anchored : *(u8 *)state; @@ -1037,7 +1057,7 @@ void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, const u8 *buf, char top, size_t start_off, size_t len, NfaCallback cb, void *ctxt) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u16 s = top ? m->start_anchored : unaligned_load_u16(state); @@ -1053,17 +1073,15 @@ void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, } char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, - u64a offset, NfaCallback callback, - UNUSED SomNfaCallback som_cb, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, context); } char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, - u64a offset, NfaCallback callback, - UNUSED SomNfaCallback som_cb, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(ISALIGNED_N(state, 2)); return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, context); diff --git a/src/nfa/mcclellan.h b/src/nfa/mcclellan.h index 6b4ec2d5..9c6b3eec 100644 --- a/src/nfa/mcclellan.h +++ b/src/nfa/mcclellan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,14 +39,14 @@ struct NFA; char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -62,14 +62,14 @@ char nfaExecMcClellan8_expandState(const struct NFA *nfa, void *dest, char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan16_QR(const struct NFA *n, 
struct mq *q, ReportID report); char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index a9fbce94..09006d5b 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -32,7 +32,6 @@ #include "accelcompile.h" #include "grey.h" #include "mcclellan_internal.h" -#include "mcclellancompile_accel.h" #include "mcclellancompile_util.h" #include "nfa_internal.h" #include "shufticompile.h" @@ -65,6 +64,17 @@ using namespace std; using boost::adaptors::map_keys; +#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 + +/** Maximum tolerated number of escape character from an accel state. + * This is larger than nfa, as we don't have a budget and the nfa cheats on stop + * characters for sets of states */ +#define ACCEL_DFA_MAX_STOP_CHAR 160 + +/** Maximum tolerated number of escape character from a sds accel state. Larger + * than normal states as accelerating sds is important. Matches NFA value */ +#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192 + namespace ue2 { namespace /* anon */ { @@ -75,7 +85,7 @@ struct dstate_extra { }; struct dfa_info { - dfa_build_strat &strat; + accel_dfa_build_strat &strat; raw_dfa &raw; vector &states; vector extra; @@ -85,7 +95,7 @@ struct dfa_info { u8 getAlphaShift() const; - explicit dfa_info(dfa_build_strat &s) + explicit dfa_info(accel_dfa_build_strat &s) : strat(s), raw(s.get_raw()), states(raw.states), @@ -128,13 +138,6 @@ mstate_aux *getAux(NFA *n, dstate_id_t i) { return aux; } -static -bool double_byte_ok(const AccelScheme &info) { - return !info.double_byte.empty() - && info.double_cr.count() < info.double_byte.size() - && info.double_cr.count() <= 2 && !info.double_byte.empty(); -} - static void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) { assert((size_t)succ_table % 2 == 0); @@ -190,120 +193,12 @@ u32 mcclellan_build_strat::max_allowed_offset_accel() const { return ACCEL_DFA_MAX_OFFSET_DEPTH; } -AccelScheme mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx) - const { - return find_mcclellan_escape_info(rdfa, this_idx, - max_allowed_offset_accel()); +u32 mcclellan_build_strat::max_stop_char() const { + return ACCEL_DFA_MAX_STOP_CHAR; } -/** builds acceleration schemes for states */ -void mcclellan_build_strat::buildAccel(UNUSED dstate_id_t this_idx, - const AccelScheme &info, - void *accel_out) { - AccelAux *accel = (AccelAux *)accel_out; - - DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset, - info.double_offset); - accel->generic.offset = verify_u8(info.offset); - - if (double_byte_ok(info) && info.double_cr.none() - && info.double_byte.size() == 1) { - accel->accel_type = ACCEL_DVERM; - accel->dverm.c1 = info.double_byte.begin()->first; - accel->dverm.c2 = info.double_byte.begin()->second; - accel->dverm.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx); - return; - } - - if (double_byte_ok(info) && info.double_cr.none() - && (info.double_byte.size() == 2 || info.double_byte.size() == 4)) { - bool ok = true; - - assert(!info.double_byte.empty()); - u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; - u8 secondC = 
info.double_byte.begin()->second & CASE_CLEAR; - - for (const pair &p : info.double_byte) { - if ((p.first & CASE_CLEAR) != firstC - || (p.second & CASE_CLEAR) != secondC) { - ok = false; - break; - } - } - - if (ok) { - accel->accel_type = ACCEL_DVERM_NOCASE; - accel->dverm.c1 = firstC; - accel->dverm.c2 = secondC; - accel->dverm.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); - return; - } - - u8 m1; - u8 m2; - if (buildDvermMask(info.double_byte, &m1, &m2)) { - accel->accel_type = ACCEL_DVERM_MASKED; - accel->dverm.offset = verify_u8(info.double_offset); - accel->dverm.c1 = info.double_byte.begin()->first & m1; - accel->dverm.c2 = info.double_byte.begin()->second & m2; - accel->dverm.m1 = m1; - accel->dverm.m2 = m2; - DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", - accel->dverm.c1, accel->dverm.c2); - return; - } - } - - if (double_byte_ok(info) - && shuftiBuildDoubleMasks(info.double_cr, info.double_byte, - &accel->dshufti.lo1, &accel->dshufti.hi1, - &accel->dshufti.lo2, &accel->dshufti.hi2)) { - accel->accel_type = ACCEL_DSHUFTI; - accel->dshufti.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is double shufti\n", this_idx); - return; - } - - if (info.cr.none()) { - accel->accel_type = ACCEL_RED_TAPE; - DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" - " from which there is no escape\n", this_idx); - return; - } - - if (info.cr.count() == 1) { - accel->accel_type = ACCEL_VERM; - accel->verm.c = info.cr.find_first(); - DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); - return; - } - - if (info.cr.count() == 2 && info.cr.isCaselessChar()) { - accel->accel_type = ACCEL_VERM_NOCASE; - accel->verm.c = info.cr.find_first() & CASE_CLEAR; - DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); - return; - } - - if (info.cr.count() > ACCEL_DFA_MAX_FLOATING_STOP_CHAR) { - accel->accel_type = ACCEL_NONE; - DEBUG_PRINTF("state %hu is too broad\n", this_idx); - return; - } - - accel->accel_type = ACCEL_SHUFTI; - if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo, - &accel->shufti.hi)) { - DEBUG_PRINTF("state %hu is shufti\n", this_idx); - return; - } - - assert(!info.cr.none()); - accel->accel_type = ACCEL_TRUFFLE; - truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2); - DEBUG_PRINTF("state %hu is truffle\n", this_idx); +u32 mcclellan_build_strat::max_floating_stop_char() const { + return ACCEL_DFA_MAX_FLOATING_STOP_CHAR; } static @@ -343,15 +238,6 @@ void populateBasicInfo(size_t state_size, const dfa_info &info, } } -raw_dfa::~raw_dfa() { -} - -raw_report_info::raw_report_info() { -} - -raw_report_info::~raw_report_info() { -} - namespace { struct raw_report_list { @@ -592,7 +478,7 @@ aligned_unique_ptr mcclellanCompile16(dfa_info &info, auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); map accel_escape_info - = populateAccelerationInfo(info.raw, info.strat, cc.grey); + = info.strat.getAccelInfo(cc.grey); size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) * count_real_states; @@ -811,7 +697,7 @@ aligned_unique_ptr mcclellanCompile8(dfa_info &info, auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); map accel_escape_info - = populateAccelerationInfo(info.raw, info.strat, cc.grey); + = info.strat.getAccelInfo(cc.grey); size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size(); @@ -1053,7 +939,7 @@ bool is_cyclic_near(const 
raw_dfa &raw, dstate_id_t root) { return false; } -aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, +aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, const CompileContext &cc, set *accel_states) { u16 total_daddy = 0; @@ -1123,12 +1009,9 @@ u32 mcclellanStartReachSize(const raw_dfa *raw) { return out.count(); } -bool has_accel_dfa(const NFA *nfa) { +bool has_accel_mcclellan(const NFA *nfa) { const mcclellan *m = (const mcclellan *)getImplNfa(nfa); return m->has_accel; } -dfa_build_strat::~dfa_build_strat() { -} - } // namespace ue2 diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index ba519cac..e6f548a7 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -29,6 +29,7 @@ #ifndef MCCLELLANCOMPILE_H #define MCCLELLANCOMPILE_H +#include "accel_dfa_build_strat.h" #include "rdfa.h" #include "ue2common.h" #include "util/accel_scheme.h" @@ -47,48 +48,20 @@ namespace ue2 { class ReportManager; struct CompileContext; -struct raw_report_info { - raw_report_info(); - virtual ~raw_report_info(); - virtual u32 getReportListSize() const = 0; /* in bytes */ - virtual size_t size() const = 0; /* number of lists */ - virtual void fillReportLists(NFA *n, size_t base_offset, - std::vector &ro /* out */) const = 0; -}; - -class dfa_build_strat { -public: - explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {} - virtual ~dfa_build_strat(); - virtual raw_dfa &get_raw() const = 0; - virtual std::unique_ptr gatherReports( - std::vector &reports /* out */, - std::vector &reports_eod /* out */, - u8 *isSingleReport /* out */, - ReportID *arbReport /* out */) const = 0; - virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const = 0; - virtual size_t accelSize(void) const = 0; - virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info, - void *accel_out) = 0; -protected: - const ReportManager &rm; -}; - -class mcclellan_build_strat : public dfa_build_strat { +class mcclellan_build_strat : public accel_dfa_build_strat { public: mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in) - : dfa_build_strat(rm_in), rdfa(rdfa_in) {} + : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {} raw_dfa &get_raw() const override { return rdfa; } std::unique_ptr gatherReports( std::vector &reports /* out */, std::vector &reports_eod /* out */, u8 *isSingleReport /* out */, ReportID *arbReport /* out */) const override; - AccelScheme find_escape_strings(dstate_id_t this_idx) const override; size_t accelSize(void) const override; - void buildAccel(dstate_id_t this_idx,const AccelScheme &info, - void *accel_out) override; - virtual u32 max_allowed_offset_accel() const; + u32 max_allowed_offset_accel() const override; + u32 max_stop_char() const override; + u32 max_floating_stop_char() const override; private: raw_dfa &rdfa; @@ -103,7 +76,7 @@ mcclellanCompile(raw_dfa &raw, const CompileContext &cc, /* used internally by mcclellan/haig/gough compile process */ ue2::aligned_unique_ptr -mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, +mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, const CompileContext &cc, std::set *accel_states = nullptr); @@ -114,7 +87,7 @@ u32 mcclellanStartReachSize(const raw_dfa *raw); std::set all_reports(const raw_dfa &rdfa); -bool has_accel_dfa(const NFA *nfa); +bool has_accel_mcclellan(const NFA *nfa); } // namespace ue2 diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 234574d8..a61a19ab 100644 --- 
a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -337,62 +337,35 @@ size_t hash_dfa(const raw_dfa &rdfa) { } static -bool has_self_loop(dstate_id_t s, const raw_dfa &raw) { - u16 top_remap = raw.alpha_remap[TOP]; - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - if (i != top_remap && raw.states[s].next[i] == s) { +bool can_die_early(const raw_dfa &raw, dstate_id_t s, + map &visited, u32 age_limit) { + if (contains(visited, s) && visited[s] >= age_limit) { + /* we have already visited (or are in the process of visiting) here with + * a looser limit. */ + return false; + } + visited[s] = age_limit; + + if (s == DEAD_STATE) { + return true; + } + + if (age_limit == 0) { + return false; + } + + for (const auto &next : raw.states[s].next) { + if (can_die_early(raw, next, visited, age_limit - 1)) { return true; } } + return false; } -dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { - if (raw.start_floating != DEAD_STATE) { - DEBUG_PRINTF("has floating start\n"); - return raw.start_floating; - } - - DEBUG_PRINTF("looking for SDS proxy\n"); - - dstate_id_t s = raw.start_anchored; - - if (has_self_loop(s, raw)) { - return s; - } - - u16 top_remap = raw.alpha_remap[TOP]; - - ue2::unordered_set seen; - while (true) { - seen.insert(s); - DEBUG_PRINTF("basis %hu\n", s); - - /* check if we are connected to a state with a self loop */ - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - dstate_id_t t = raw.states[s].next[i]; - if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) { - return t; - } - } - - /* find a neighbour to use as a basis for looking for the sds proxy */ - dstate_id_t t = DEAD_STATE; - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - dstate_id_t tt = raw.states[s].next[i]; - if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) { - t = tt; - break; - } - } - - if (t == DEAD_STATE) { - /* we were unable to find a state to use as a SDS proxy */ - return DEAD_STATE; - } - - s = t; - } +bool can_die_early(const raw_dfa &raw, u32 age_limit) { + map visited; + return can_die_early(raw, raw.start_anchored, visited, age_limit); } } // namespace ue2 diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index 7b6c033a..554c1efd 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -55,7 +55,7 @@ size_t hash_dfa_no_reports(const raw_dfa &rdfa); /** \brief Compute a simple hash of this raw_dfa, including its reports. 
*/ size_t hash_dfa(const raw_dfa &rdfa); -dstate_id_t get_sds_or_proxy(const raw_dfa &raw); +bool can_die_early(const raw_dfa &raw, u32 age_limit); } // namespace ue2 diff --git a/src/nfa/mcclellandump.cpp b/src/nfa/mcclellandump.cpp index 52711bf1..dcbb0915 100644 --- a/src/nfa/mcclellandump.cpp +++ b/src/nfa/mcclellandump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -267,7 +267,8 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } -void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) { +void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == MCCLELLAN_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -286,7 +287,8 @@ void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) { fprintf(f, "}\n"); } -void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f) { +void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == MCCLELLAN_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); diff --git a/src/nfa/mcclellandump.h b/src/nfa/mcclellandump.h index d74a6b6d..efa61544 100644 --- a/src/nfa/mcclellandump.h +++ b/src/nfa/mcclellandump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "rdfa.h" #include +#include struct mcclellan; struct mstate_aux; @@ -42,8 +43,10 @@ union AccelAux; namespace ue2 { -void nfaExecMcClellan8_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecMcClellan16_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecMcClellan8_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecMcClellan16_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void nfaExecMcClellan8_dumpText(const struct NFA *nfa, FILE *file); void nfaExecMcClellan16_dumpText(const struct NFA *nfa, FILE *file); diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c index 4bae7b18..c6c8cb88 100644 --- a/src/nfa/mpv.c +++ b/src/nfa/mpv.c @@ -131,7 +131,8 @@ char processReports(const struct mpv *m, u8 *reporters, rl_count++; } - if (cb(report_offset, curr->report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, report_offset, curr->report, ctxt) == + MO_HALT_MATCHING) { DEBUG_PRINTF("bailing\n"); return MO_HALT_MATCHING; } @@ -180,7 +181,7 @@ char processReportsForRange(const struct mpv *m, u8 *reporters, for (size_t i = 2; i <= length; i++) { for (u32 j = 0; j < rl_count; j++) { - if (cb(first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { + if (cb(0, first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { DEBUG_PRINTF("bailing\n"); return MO_HALT_MATCHING; } diff --git a/src/nfa/mpv.h b/src/nfa/mpv.h index dc5dad6f..a3f90719 100644 --- a/src/nfa/mpv.h +++ b/src/nfa/mpv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,6 @@ struct NFA; char nfaExecMpv0_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMpv0_reportCurrent(const struct NFA *n, struct mq 
*q); -char nfaExecMpv0_inAccept(const struct NFA *n, ReportID report, struct mq *q); char nfaExecMpv0_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMpv0_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -47,6 +46,7 @@ char nfaExecMpv0_expandState(const struct NFA *nfa, void *dest, const void *src, #define nfaExecMpv0_testEOD NFA_API_NO_IMPL #define nfaExecMpv0_inAccept NFA_API_NO_IMPL +#define nfaExecMpv0_inAnyAccept NFA_API_NO_IMPL #define nfaExecMpv0_QR NFA_API_NO_IMPL #define nfaExecMpv0_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ #define nfaExecMpv0_B_Reverse NFA_API_NO_IMPL diff --git a/src/nfa/mpv_dump.cpp b/src/nfa/mpv_dump.cpp index 504cc677..da21d7cf 100644 --- a/src/nfa/mpv_dump.cpp +++ b/src/nfa/mpv_dump.cpp @@ -48,7 +48,8 @@ namespace ue2 { -void nfaExecMpv0_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *file) { +void nfaExecMpv0_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *file, + UNUSED const std::string &base) { } static really_inline diff --git a/src/nfa/mpv_dump.h b/src/nfa/mpv_dump.h index 5dcd9f8b..23910dce 100644 --- a/src/nfa/mpv_dump.h +++ b/src/nfa/mpv_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,12 +32,14 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; namespace ue2 { -void nfaExecMpv0_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecMpv0_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void nfaExecMpv0_dumpText(const struct NFA *nfa, FILE *file); } // namespace ue2 diff --git a/src/nfa/multiaccel_compilehelper.cpp b/src/nfa/multiaccel_compilehelper.cpp index f1cf2a4c..4c1f8101 100644 --- a/src/nfa/multiaccel_compilehelper.cpp +++ b/src/nfa/multiaccel_compilehelper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -347,9 +347,9 @@ void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) { } } -MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off, - unsigned max_len) : - cr(ref_cr), offset(off), max_len(max_len) { +MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, + u32 off, unsigned max_length) + : cr(ref_cr), offset(off), max_len(max_length) { int accel_num = (int) MultibyteAccelInfo::MAT_MAX; accels.resize(accel_num); diff --git a/src/nfa/multishufti_avx2.h b/src/nfa/multishufti_avx2.h index e9980872..042f5570 100644 --- a/src/nfa/multishufti_avx2.h +++ b/src/nfa/multishufti_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" static really_inline const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars, diff --git a/src/nfa/multishufti_sse.h b/src/nfa/multishufti_sse.h index 7ea5946d..0a9b543e 100644 --- a/src/nfa/multishufti_sse.h +++ b/src/nfa/multishufti_sse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel 
Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /* Normal SSSE3 shufti */ diff --git a/src/nfa/multitruffle.c b/src/nfa/multitruffle.c index 3af6394a..381bda93 100644 --- a/src/nfa/multitruffle.c +++ b/src/nfa/multitruffle.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,7 +32,6 @@ #include "multitruffle.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include "multiaccel_common.h" diff --git a/src/nfa/nfa_api.h b/src/nfa/nfa_api.h index 84f5c4a0..e3f7f743 100644 --- a/src/nfa/nfa_api.h +++ b/src/nfa/nfa_api.h @@ -120,6 +120,16 @@ char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state, */ char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExec() and just dispatches directly to the nfa implementations. It is + * intended to be used by the Tamarama engine. + */ +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end); + +/** Return value indicating that the engine is dead. */ +#define MO_DEAD 0 + /** Return value indicating that the engine is alive. */ #define MO_ALIVE 1 @@ -155,6 +165,13 @@ char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); */ char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end); +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExecToMatch() and just dispatches directly to the nfa + * implementations. It is intended to be used by the Tamarama engine. + */ +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end); + /** * Report matches at the current queue location. * @@ -175,10 +192,16 @@ char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q); */ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); +/** + * Returns non-zero if the NFA is in any accept state regardless of report + * ID. + */ +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q); + /** * Process the queued commands on the given NFA up to end or the first match. * - * Note: This version is meant for rose prefix NFAs: + * Note: This version is meant for rose prefix/infix NFAs: * - never uses a callback * - loading of state at a point in history is not special cased * @@ -187,9 +210,9 @@ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); * end with some variant of end. The location field of the events must * be monotonically increasing. If not all the data was processed during * the call, the queue is updated to reflect the remaining work. - * @param report we are interested in, if set at the end of the scan returns - * @ref MO_MATCHES_PENDING. If no report is desired, MO_INVALID_IDX should - * be passed in. + * @param report we are interested in. If the given report will be raised at + * the end location, the function returns @ref MO_MATCHES_PENDING. If no + * match information is desired, MO_INVALID_IDX should be passed in. 
* @return @ref MO_ALIVE if the nfa is still active with no matches pending, * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not * alive @@ -205,6 +228,9 @@ char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID report); * Runs an NFA in reverse from (buf + buflen) to buf and then from (hbuf + hlen) * to hbuf (main buffer and history buffer). * + * Note: provides the match location as the "end" offset when the callback is + * called. + * * @param nfa engine to run * @param offset base offset of buf * @param buf main buffer @@ -229,7 +255,6 @@ char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, * (including br region) * @param offset the offset to return (via the callback) with each match * @param callback the callback to call for each match raised - * @param som_cb the callback to call for each match raised (Haig) * @param context context pointer passed to each callback * * @return @ref MO_HALT_MATCHING if the user instructed us to halt, otherwise @@ -237,8 +262,7 @@ char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, */ char nfaCheckFinalState(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); /** * Indicates if an engine is a zombie. diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index fb27e4eb..c67103b3 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -42,6 +42,8 @@ #include "limex.h" #include "mcclellan.h" #include "mpv.h" +#include "sheng.h" +#include "tamarama.h" #define DISPATCH_CASE(dc_ltype, dc_ftype, dc_subtype, dc_func_call) \ case dc_ltype##_NFA_##dc_subtype: \ @@ -52,41 +54,11 @@ #define DISPATCH_BY_NFA_TYPE(dbnt_func) \ switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 
512_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_7, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ @@ -98,21 +70,22 @@ DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ + DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ } char nfaCheckFinalState(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context) { + NfaCallback callback, void *context) { assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); // Caller should avoid calling us if we can never produce matches. assert(nfaAcceptsEod(nfa)); DISPATCH_BY_NFA_TYPE(_testEOD(nfa, state, streamState, offset, callback, - som_cb, context)); + context)); return 0; } @@ -135,6 +108,14 @@ char nfaQueueExec2_i(const struct NFA *nfa, struct mq *q, s64a end) { return 0; } +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec_i(nfa, q, end); +} + +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec2_i(nfa, q, end); +} + static really_inline char nfaQueueExecRose_i(const struct NFA *nfa, struct mq *q, ReportID report) { DISPATCH_BY_NFA_TYPE(_QR(nfa, q, report)); @@ -258,7 +239,6 @@ char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end) { assert(q); assert(end >= 0); - assert(q->context); assert(q->state); assert(q->cur < q->end); assert(q->end <= MAX_MQE_LEN); @@ -315,6 +295,11 @@ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q) { return 0; } +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_inAnyAccept(nfa, q)); + return 0; +} + char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID r) { DEBUG_PRINTF("nfa=%p\n", nfa); #ifdef DEBUG diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index 1373425d..e3579a7e 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -91,12 +91,12 @@ struct mq { * history buffer; (logically) immediately before the * main buffer */ size_t hlength; /**< length of the history buffer */ + struct hs_scratch *scratch; /**< global scratch space */ char report_current; /**< * report_current matches at starting offset through * callback. 
If true, the queue must be located at a * point where MO_MATCHES_PENDING was returned */ NfaCallback cb; /**< callback to trigger on matches */ - SomNfaCallback som_cb; /**< callback with som info; used by haig */ void *context; /**< context to pass along with a callback */ struct mq_item items[MAX_MQE_LEN]; /**< queue items */ }; diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 2ac0505e..93376b01 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "limex_internal.h" #include "mcclellancompile.h" +#include "shengcompile.h" #include "nfa_internal.h" #include "repeat_internal.h" #include "ue2common.h" @@ -78,7 +79,7 @@ struct DISPATCH_BY_NFA_TYPE_INT { decltype(arg), (NFAEngineType)0>::doOp(i, arg) } -typedef bool (*has_accel_fn)(const NFA *nfa); +typedef bool (*nfa_dispatch_fn)(const NFA *nfa); template static @@ -87,8 +88,37 @@ bool has_accel_limex(const NFA *nfa) { return limex->accelCount; } +template static -bool has_accel_generic(const NFA *) { +bool has_repeats_limex(const NFA *nfa) { + const T *limex = (const T *)getImplNfa(nfa); + return limex->repeatCount; +} + + +template +static +bool has_repeats_other_than_firsts_limex(const NFA *nfa) { + const T *limex = (const T *)getImplNfa(nfa); + const char *ptr = (const char *)limex; + + const u32 *repeatOffset = (const u32 *)(ptr + limex->repeatOffset); + + for (u32 i = 0; i < limex->repeatCount; i++) { + u32 offset = repeatOffset[i]; + const NFARepeatInfo *info = (const NFARepeatInfo *)(ptr + offset); + const RepeatInfo *repeat = + (const RepeatInfo *)((const char *)info + sizeof(*info)); + if (repeat->type != REPEAT_FIRST) { + return true; + } + } + + return false; +} + +static +bool dispatch_false(const NFA *) { return false; } @@ -140,72 +170,53 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; #define DO_IF_DUMP_SUPPORT(a) #endif -#define MAKE_LIMEX_TRAITS(mlt_size, mlt_shift) \ - template<> struct NFATraits { \ +#define MAKE_LIMEX_TRAITS(mlt_size) \ + template<> struct NFATraits { \ static UNUSED const char *name; \ static const NFACategory category = NFA_LIMEX; \ typedef LimExNFA##mlt_size implNFA_t; \ typedef u_##mlt_size tableRow_t; \ - static const has_accel_fn has_accel; \ + static const nfa_dispatch_fn has_accel; \ + static const nfa_dispatch_fn has_repeats; \ + static const nfa_dispatch_fn has_repeats_other_than_firsts; \ static const u32 stateAlign = \ MAX(alignof(tableRow_t), alignof(RepeatControl)); \ static const bool fast = mlt_size <= 64; \ }; \ - const has_accel_fn NFATraits::has_accel \ + const nfa_dispatch_fn NFATraits::has_accel \ = has_accel_limex; \ + const nfa_dispatch_fn NFATraits::has_repeats \ + = has_repeats_limex; \ + const nfa_dispatch_fn \ + NFATraits::has_repeats_other_than_firsts \ + = has_repeats_other_than_firsts_limex; \ DO_IF_DUMP_SUPPORT( \ - const char *NFATraits::name \ - = "LimEx (0-"#mlt_shift") "#mlt_size; \ - template<> struct getDescription { \ - static string call(const void *ptr) { \ - return getDescriptionLimEx((const NFA *)ptr); \ + const char *NFATraits::name \ + = "LimEx "#mlt_size; \ + template<> struct getDescription { \ + static string call(const void *ptr) { \ + return getDescriptionLimEx((const NFA *)ptr); \ } \ };) -MAKE_LIMEX_TRAITS(32, 1) 
-MAKE_LIMEX_TRAITS(32, 2) -MAKE_LIMEX_TRAITS(32, 3) -MAKE_LIMEX_TRAITS(32, 4) -MAKE_LIMEX_TRAITS(32, 5) -MAKE_LIMEX_TRAITS(32, 6) -MAKE_LIMEX_TRAITS(32, 7) -MAKE_LIMEX_TRAITS(128, 1) -MAKE_LIMEX_TRAITS(128, 2) -MAKE_LIMEX_TRAITS(128, 3) -MAKE_LIMEX_TRAITS(128, 4) -MAKE_LIMEX_TRAITS(128, 5) -MAKE_LIMEX_TRAITS(128, 6) -MAKE_LIMEX_TRAITS(128, 7) -MAKE_LIMEX_TRAITS(256, 1) -MAKE_LIMEX_TRAITS(256, 2) -MAKE_LIMEX_TRAITS(256, 3) -MAKE_LIMEX_TRAITS(256, 4) -MAKE_LIMEX_TRAITS(256, 5) -MAKE_LIMEX_TRAITS(256, 6) -MAKE_LIMEX_TRAITS(256, 7) -MAKE_LIMEX_TRAITS(384, 1) -MAKE_LIMEX_TRAITS(384, 2) -MAKE_LIMEX_TRAITS(384, 3) -MAKE_LIMEX_TRAITS(384, 4) -MAKE_LIMEX_TRAITS(384, 5) -MAKE_LIMEX_TRAITS(384, 6) -MAKE_LIMEX_TRAITS(384, 7) -MAKE_LIMEX_TRAITS(512, 1) -MAKE_LIMEX_TRAITS(512, 2) -MAKE_LIMEX_TRAITS(512, 3) -MAKE_LIMEX_TRAITS(512, 4) -MAKE_LIMEX_TRAITS(512, 5) -MAKE_LIMEX_TRAITS(512, 6) -MAKE_LIMEX_TRAITS(512, 7) +MAKE_LIMEX_TRAITS(32) +MAKE_LIMEX_TRAITS(128) +MAKE_LIMEX_TRAITS(256) +MAKE_LIMEX_TRAITS(384) +MAKE_LIMEX_TRAITS(512) template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "McClellan 8"; #endif @@ -215,9 +226,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 2; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "McClellan 16"; #endif @@ -227,9 +242,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Goughfish 8"; #endif @@ -239,9 +258,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = 
has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Goughfish 16"; #endif @@ -251,9 +274,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Mega-Puff-Vac"; #endif @@ -263,9 +290,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Castle"; #endif @@ -275,9 +306,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (D)"; #endif @@ -287,9 +322,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (V)"; #endif @@ -299,9 +338,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const 
nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (NV)"; #endif @@ -311,9 +354,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (S)"; #endif @@ -323,13 +370,49 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (M)"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 32; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Tamarama"; +#endif + } // namespace #if defined(DUMP_SUPPORT) @@ -380,42 +463,39 @@ struct is_limex { }; } +namespace { +template +struct has_repeats_other_than_firsts_dispatch { + static nfa_dispatch_fn call(const void *) { + return NFATraits::has_repeats_other_than_firsts; + } +}; +} + bool has_bounded_repeats_other_than_firsts(const NFA &nfa) { - if (!DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, is_limex, &nfa)) { - return false; + return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, + has_repeats_other_than_firsts_dispatch, + &nfa)(&nfa); +} + +namespace { +template +struct has_repeats_dispatch { + static 
nfa_dispatch_fn call(const void *) { + return NFATraits::has_repeats; } - - const LimExNFABase *limex = (const LimExNFABase *)getImplNfa(&nfa); - const char *ptr = (const char *)limex; - - const u32 *repeatOffset = (const u32 *)(ptr + limex->repeatOffset); - - for (u32 i = 0; i < limex->repeatCount; i++) { - u32 offset = repeatOffset[i]; - const NFARepeatInfo *info = (const NFARepeatInfo *)(ptr + offset); - const RepeatInfo *repeat = - (const RepeatInfo *)((const char *)info + sizeof(*info)); - if (repeat->type != REPEAT_FIRST) { - return true; - } - } - - return false; +}; } bool has_bounded_repeats(const NFA &nfa) { - if (!DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, is_limex, &nfa)) { - return false; - } - - const LimExNFABase *limex = (const LimExNFABase *)getImplNfa(&nfa); - return limex->repeatCount; + return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, has_repeats_dispatch, + &nfa)(&nfa); } namespace { template struct has_accel_dispatch { - static has_accel_fn call(const void *) { + static nfa_dispatch_fn call(const void *) { return NFATraits::has_accel; } }; @@ -423,8 +503,7 @@ struct has_accel_dispatch { bool has_accel(const NFA &nfa) { return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, has_accel_dispatch, - &nfa) - (&nfa); + &nfa)(&nfa); } bool requires_decompress_key(const NFA &nfa) { diff --git a/src/nfa/nfa_dump_api.h b/src/nfa/nfa_dump_api.h index 8675dd5d..1054a204 100644 --- a/src/nfa/nfa_dump_api.h +++ b/src/nfa/nfa_dump_api.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,7 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; @@ -45,7 +46,7 @@ namespace ue2 { * \brief Dump (in Graphviz 'dot' format) a representation of the NFA into the * file pointed to by dotFile. */ -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile); +void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, const std::string &base); /** \brief Dump a textual representation of the NFA. 
*/ void nfaDumpText(const struct NFA *fact, FILE *textFile); diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 4a59dc1e..388ac003 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "limex.h" #include "mcclellandump.h" #include "mpv_dump.h" +#include "shengdump.h" +#include "tamarama_dump.h" #ifndef DUMP_SUPPORT #error "no dump support" @@ -57,41 +59,11 @@ namespace ue2 { #define DISPATCH_BY_NFA_TYPE(dbnt_func) \ DEBUG_PRINTF("dispatch for NFA type %u\n", nfa->type); \ switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_7, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ @@ -103,12 +75,15 @@ namespace ue2 { DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ + DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ } -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile) { - DISPATCH_BY_NFA_TYPE(_dumpDot(nfa, dotFile)); +void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, + const std::string &base) { + DISPATCH_BY_NFA_TYPE(_dumpDot(nfa, dotFile, base)); 
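/* A sketch of the dispatch being simplified here: DISPATCH_BY_NFA_TYPE
 * switches on nfa->type, and each DISPATCH_CASE token-pastes the engine
 * family and subtype into the matching dump entry point. The exact macro
 * spelling is not shown in this hunk, but a LimEx case now reads roughly:
 *
 *     case LIMEX_NFA_128:
 *         nfaExecLimEx128_dumpDot(nfa, dotFile, base);
 *         break;
 *
 * so only five LimEx cases remain, one per state size, rather than one per
 * (size, shift) variant. */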
} void nfaDumpText(const struct NFA *nfa, FILE *txtFile) { diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 089e9683..41fee73e 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -51,41 +51,11 @@ extern "C" // Common data structures for NFAs enum NFAEngineType { - LIMEX_NFA_32_1, - LIMEX_NFA_32_2, - LIMEX_NFA_32_3, - LIMEX_NFA_32_4, - LIMEX_NFA_32_5, - LIMEX_NFA_32_6, - LIMEX_NFA_32_7, - LIMEX_NFA_128_1, - LIMEX_NFA_128_2, - LIMEX_NFA_128_3, - LIMEX_NFA_128_4, - LIMEX_NFA_128_5, - LIMEX_NFA_128_6, - LIMEX_NFA_128_7, - LIMEX_NFA_256_1, - LIMEX_NFA_256_2, - LIMEX_NFA_256_3, - LIMEX_NFA_256_4, - LIMEX_NFA_256_5, - LIMEX_NFA_256_6, - LIMEX_NFA_256_7, - LIMEX_NFA_384_1, - LIMEX_NFA_384_2, - LIMEX_NFA_384_3, - LIMEX_NFA_384_4, - LIMEX_NFA_384_5, - LIMEX_NFA_384_6, - LIMEX_NFA_384_7, - LIMEX_NFA_512_1, - LIMEX_NFA_512_2, - LIMEX_NFA_512_3, - LIMEX_NFA_512_4, - LIMEX_NFA_512_5, - LIMEX_NFA_512_6, - LIMEX_NFA_512_7, + LIMEX_NFA_32, + LIMEX_NFA_128, + LIMEX_NFA_256, + LIMEX_NFA_384, + LIMEX_NFA_512, MCCLELLAN_NFA_8, /**< magic pseudo nfa */ MCCLELLAN_NFA_16, /**< magic pseudo nfa */ GOUGH_NFA_8, /**< magic pseudo nfa */ @@ -97,6 +67,8 @@ enum NFAEngineType { LBR_NFA_Shuf, /**< magic pseudo nfa */ LBR_NFA_Truf, /**< magic pseudo nfa */ CASTLE_NFA_0, /**< magic pseudo nfa */ + SHENG_NFA_0, /**< magic pseudo nfa */ + TAMARAMA_NFA_0, /**< magic nfa container */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -175,50 +147,27 @@ static really_inline int isGoughType(u8 t) { return t == GOUGH_NFA_8 || t == GOUGH_NFA_16; } -/** \brief True if the given type (from NFA::type) is a McClellan or Gough DFA. - * */ +/** \brief True if the given type (from NFA::type) is a Sheng DFA. */ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA_0; +} + +/** + * \brief True if the given type (from NFA::type) is a McClellan, Gough or + * Sheng DFA. + */ static really_inline int isDfaType(u8 t) { - return isMcClellanType(t) || isGoughType(t); + return isMcClellanType(t) || isGoughType(t) || isShengType(t); } /** \brief True if the given type (from NFA::type) is an NFA. */ static really_inline int isNfaType(u8 t) { switch (t) { - case LIMEX_NFA_32_1: - case LIMEX_NFA_32_2: - case LIMEX_NFA_32_3: - case LIMEX_NFA_32_4: - case LIMEX_NFA_32_5: - case LIMEX_NFA_32_6: - case LIMEX_NFA_32_7: - case LIMEX_NFA_128_1: - case LIMEX_NFA_128_2: - case LIMEX_NFA_128_3: - case LIMEX_NFA_128_4: - case LIMEX_NFA_128_5: - case LIMEX_NFA_128_6: - case LIMEX_NFA_128_7: - case LIMEX_NFA_256_1: - case LIMEX_NFA_256_2: - case LIMEX_NFA_256_3: - case LIMEX_NFA_256_4: - case LIMEX_NFA_256_5: - case LIMEX_NFA_256_6: - case LIMEX_NFA_256_7: - case LIMEX_NFA_384_1: - case LIMEX_NFA_384_2: - case LIMEX_NFA_384_3: - case LIMEX_NFA_384_4: - case LIMEX_NFA_384_5: - case LIMEX_NFA_384_6: - case LIMEX_NFA_384_7: - case LIMEX_NFA_512_1: - case LIMEX_NFA_512_2: - case LIMEX_NFA_512_3: - case LIMEX_NFA_512_4: - case LIMEX_NFA_512_5: - case LIMEX_NFA_512_6: - case LIMEX_NFA_512_7: + case LIMEX_NFA_32: + case LIMEX_NFA_128: + case LIMEX_NFA_256: + case LIMEX_NFA_384: + case LIMEX_NFA_512: return 1; default: break; @@ -233,6 +182,12 @@ int isLbrType(u8 t) { t == LBR_NFA_Shuf || t == LBR_NFA_Truf; } +/** \brief True if the given type (from NFA::type) is a container engine. 
*/ +static really_inline +int isContainerType(u8 t) { + return t == TAMARAMA_NFA_0; +} + static really_inline int isMultiTopType(u8 t) { return !isDfaType(t) && !isLbrType(t); diff --git a/src/nfa/nfa_kind.h b/src/nfa/nfa_kind.h index 46d0bc4c..f2ac6189 100644 --- a/src/nfa/nfa_kind.h +++ b/src/nfa/nfa_kind.h @@ -37,6 +37,8 @@ #include "ue2common.h" +#include + namespace ue2 { /** \brief Specify the use-case for an nfa engine. */ @@ -47,6 +49,7 @@ enum nfa_kind { NFA_OUTFIX, //!< "outfix" nfa not triggered by external events NFA_OUTFIX_RAW, //!< "outfix", but with unmanaged reports NFA_REV_PREFIX, //! reverse running prefixes (for som) + NFA_EAGER_PREFIX, //!< rose prefix that is also run up to matches }; /** \brief True if this kind of engine is triggered by a top event. */ @@ -63,8 +66,10 @@ bool is_triggered(enum nfa_kind k) { } /** - * \brief True if this kind of engine generates callback events when it - * enters accept states. + * \brief True if this kind of engine actively checks for accept + * states, either to halt matching or to raise a callback. Only engines + * generated with this property should call nfaQueueExec() or + * nfaQueueExecToMatch(). */ inline bool generates_callbacks(enum nfa_kind k) { @@ -73,6 +78,24 @@ bool generates_callbacks(enum nfa_kind k) { case NFA_OUTFIX: case NFA_OUTFIX_RAW: case NFA_REV_PREFIX: + case NFA_EAGER_PREFIX: + return true; + default: + return false; + } +} + +/** + * \brief True if this kind of engine has its state inspected to see if it is in + * an accept state. Engines generated with this property will commonly call + * nfaQueueExecRose(), nfaInAcceptState(), and nfaInAnyAcceptState(). + */ +inline +bool inspects_states_for_accepts(enum nfa_kind k) { + switch (k) { + case NFA_PREFIX: + case NFA_INFIX: + case NFA_EAGER_PREFIX: + return true; + default: + return false; @@ -94,6 +117,32 @@ bool has_managed_reports(enum nfa_kind k) { } } +#if defined(DEBUG) || defined(DUMP_SUPPORT) + +inline +std::string to_string(nfa_kind k) { + switch (k) { + case NFA_PREFIX: + return "PREFIX"; + case NFA_INFIX: + return "INFIX"; + case NFA_SUFFIX: + return "SUFFIX"; + case NFA_OUTFIX: + return "OUTFIX"; + case NFA_REV_PREFIX: + return "REV_PREFIX"; + case NFA_OUTFIX_RAW: + return "OUTFIX_RAW"; + case NFA_EAGER_PREFIX: + return "EAGER_PREFIX"; + } + assert(0); + return "?"; +} + +#endif + } // namespace ue2 #endif diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c new file mode 100644 index 00000000..bbbf1f20 --- /dev/null +++ b/src/nfa/sheng.c @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sheng.h" + +#include "accel.h" +#include "sheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/join.h" +#include "util/simd_utils.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct sheng *get_sheng(const struct NFA *n) { + return (const struct sheng *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux(const struct sheng *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel(const struct sheng *sh, u8 id) { + const struct sstate_aux *saux = get_aux(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char shengHasAccept(const struct sheng *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireSingleReport(NfaCallback cb, void *ctxt, ReportID r, u64a loc) { + DEBUG_PRINTF("reporting %u\n", r); + if (cb(0, loc, r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ 
%llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux(sh, state); + const struct report_list *rl = eod ? get_eod_rl(sh, aux) : get_rl(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +/* include Sheng function definitions */ +#include "sheng_defs.h" + +static really_inline +char runShengCb(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } else { + if (has_accel) { + rv = sheng4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runShengNm(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if 
(has_accel) { + sheng4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } else { + sheng4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } + sheng_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runShengSam(const struct sheng *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, + end, scanned); + } else { + if (has_accel) { + rv = sheng4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng execution in state %u\n", + state & SHENG_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + 
if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + /* if we're in nomatch mode or if we're scanning history buffer */ + if (mode == NO_MATCHES || + (cur_start < 0 && mode == CALLBACK_OUTPUT)) { + runShengNm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runShengCb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runShengSam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + 
/* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + if (can_die) { + return (state & SHENG_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG_STATE_MASK, + new_state & SHENG_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng\n"); + assert(n->type == SHENG_NFA_0); + const struct sheng *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runShengCb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG_STATE_MASK); + + const struct sstate_aux *aux = get_aux(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports(sh, cb, context, state, end + offset, &cached_accept_state, + &cached_accept_id, 1); + } + + return state & SHENG_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng0_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept) { + return 0; + } + + return shengHasAccept(sh, aux, report); +} + +char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + return !!aux->accept; +} + +char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng *sh = get_sheng(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng *sh = (const struct sheng *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 1); + } + } + + return 0; +} + +char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng *sh = get_sheng(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG_STATE_DEAD); +} + +char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng *sh = get_sheng(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng0_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng0_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h new file mode 100644 index 00000000..46ead180 --- /dev/null +++ b/src/nfa/sheng.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SHENG_H_ +#define SHENG_H_ + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +#define nfaExecSheng0_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng0_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng0_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecSheng0_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h new file mode 100644 index 00000000..26bdbcee --- /dev/null +++ b/src/nfa/sheng_defs.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SHENG_DEFS_H +#define SHENG_DEFS_H + +/* + * Utility functions used by various versions of Sheng engine + */ +static really_inline +u8 isDeadState(const u8 a) { + return a & SHENG_STATE_DEAD; +} + +static really_inline +u8 isAcceptState(const u8 a) { + return a & SHENG_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState(const u8 a) { + return a & SHENG_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); +} + +/* these functions should be optimized out, used by NO_MATCHES mode */ +static really_inline +u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, + UNUSED const u8 d) { + return 0; +} + +static really_inline +u8 dummyFunc(UNUSED const u8 a) { + return 0; +} + +/* + * Sheng function definitions for single byte loops + */ +/* callback output, can die */ +#define SHENG_IMPL sheng_cod +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can't die */ +#define SHENG_IMPL sheng_co +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die */ +#define SHENG_IMPL sheng_samd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die */ +#define SHENG_IMPL sheng_sam +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can die */ +#define SHENG_IMPL sheng_nmd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can't die */ +#define SHENG_IMPL sheng_nm +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* + * Sheng function definitions for 4-byte loops + */ +/* callback output, can die, accelerated */ +#define SHENG_IMPL sheng4_coda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can die, not accelerated */ +#define SHENG_IMPL sheng4_cod +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef 
STOP_AT_MATCH + +/* callback output, can't die, accelerated */ +#define SHENG_IMPL sheng4_coa +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can't die, not accelerated */ +#define SHENG_IMPL sheng4_co +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die, accelerated */ +#define SHENG_IMPL sheng4_samda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die, not accelerated */ +#define SHENG_IMPL sheng4_samd +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die, accelerated */ +#define SHENG_IMPL sheng4_sama +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die, not accelerated */ +#define SHENG_IMPL sheng4_sam +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no-match have interesting func as dummy, and die/accel checks are outer */ + +/* no match, can die, accelerated */ +#define SHENG_IMPL sheng4_nmda +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC 
dummyFunc +#define OUTER_ACCEL_FUNC isAccelState +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can die, not accelerated */ +#define SHENG_IMPL sheng4_nmd +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* there is no performance benefit in accelerating a no-match case that can't + * die */ + +/* no match, can't die */ +#define SHENG_IMPL sheng4_nm +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +#endif // SHENG_DEFS_H diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h new file mode 100644 index 00000000..fc3e54aa --- /dev/null +++ b/src/nfa/sheng_impl.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - DEAD_FUNC (name of the function checking for dead states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* byte-by-byte version. we don't do byte-by-byte death checking as it's + * pretty pointless to do it over a buffer that's at most 3 bytes long */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m128 shuffle_mask = masks[c]; + cur_state = pshufb(shuffle_mask, cur_state); + const u8 tmp = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", tmp, (tmp & 0xF0) >> 4, + tmp & 0xF); + + if (unlikely(ACCEPT_FUNC(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h new file mode 100644 index 00000000..2561e52d --- /dev/null +++ b/src/nfa/sheng_impl4.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
 IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * In order to use this macro, the following things need to be defined:
+ *
+ * - SHENG_IMPL (name of the Sheng implementation function)
+ * - INTERESTING_FUNC (name of the function checking for accept, accel or dead
+ *   states)
+ * - INNER_DEAD_FUNC (name of the inner function checking for dead states)
+ * - OUTER_DEAD_FUNC (name of the outer function checking for dead states)
+ * - INNER_ACCEL_FUNC (name of the inner function checking for accel states)
+ * - OUTER_ACCEL_FUNC (name of the outer function checking for accel states)
+ * - ACCEPT_FUNC (name of the function checking for accept state)
+ * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match)
+ */
+
+/* unrolled 4-byte-at-a-time version.
+ *
+ * we put innerDeadFunc inside the interestingFunc() block so that we don't
+ * pay for dead state checking. however, if interestingFunc is a dummy,
+ * innerDeadFunc gets lost with it, so we need an additional check outside the
+ * interestingFunc() branch - it's normally a dummy so we don't pay for it,
+ * but when interestingFunc is a dummy, outerDeadFunc should be set if we want
+ * to check for dead states.
+ *
+ * also, the dead-state functions only check the last known state, but since
+ * we can never get out of the dead state and we don't really care where we
+ * died, it's not a problem.
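+ *
+ * e.g. the "no match, can die, not accelerated" variant in sheng_defs.h
+ * (sheng4_nmd) sets INTERESTING_FUNC to dummyFunc4 and OUTER_DEAD_FUNC to
+ * isDeadState, with every other hook defined as dummyFunc, before including
+ * this file.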
+ */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC(*state) || OUTER_ACCEL_FUNC(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = get_accel(s, *state & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC(*state) || OUTER_DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m128 shuffle_mask1 = masks[c1]; + cur_state = pshufb(shuffle_mask1, cur_state); + const u8 a1 = movd(cur_state); + + const m128 shuffle_mask2 = masks[c2]; + cur_state = pshufb(shuffle_mask2, cur_state); + const u8 a2 = movd(cur_state); + + const m128 shuffle_mask3 = masks[c3]; + cur_state = pshufb(shuffle_mask3, cur_state); + const u8 a3 = movd(cur_state); + + const m128 shuffle_mask4 = masks[c4]; + cur_state = pshufb(shuffle_mask4, cur_state); + const u8 a4 = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a1, (a1 & 0xF0) >> 4, a1 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a2, (a2 & 0xF0) >> 4, a2 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a3, (a3 & 0xF0) >> 4, a3 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a4, (a4 & 0xF0) >> 4, a4 & 0xF); + + if (unlikely(INTERESTING_FUNC(a1, a2, a3, a4))) { + if (ACCEPT_FUNC(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + 
continue; + } + } + if (OUTER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} diff --git a/src/nfa/limex_simd512c.c b/src/nfa/sheng_internal.h similarity index 65% rename from src/nfa/limex_simd512c.c rename to src/nfa/sheng_internal.h index 0918fca5..046eb759 100644 --- a/src/nfa/limex_simd512c.c +++ b/src/nfa/sheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,44 +26,45 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file - * \brief LimEx NFA: 512-bit SIMD runtime implementations. - */ +#ifndef SHENG_INTERNAL_H_ +#define SHENG_INTERNAL_H_ -//#define DEBUG_INPUT -//#define DEBUG_EXCEPTIONS - -#include "limex.h" - -#include "accel.h" -#include "limex_internal.h" -#include "nfa_internal.h" #include "ue2common.h" -#include "util/bitutils.h" #include "util/simd_utils.h" -// Common code -#include "limex_runtime.h" +#define SHENG_STATE_ACCEPT 0x10 +#define SHENG_STATE_DEAD 0x20 +#define SHENG_STATE_ACCEL 0x40 +#define SHENG_STATE_MASK 0xF +#define SHENG_STATE_FLAG_MASK 0x70 -#define SIZE 512 -#define STATE_T m512 -#include "limex_exceptional.h" +#define SHENG_FLAG_SINGLE_REPORT 0x1 +#define SHENG_FLAG_CAN_DIE 0x2 +#define SHENG_FLAG_HAS_ACCEL 0x4 -#define SIZE 512 -#define STATE_T m512 -#include "limex_state_impl.h" +struct report_list { + u32 count; + ReportID report[]; +}; -#define SIZE 512 -#define STATE_T m512 -#define INLINE_ATTR really_inline -#include "limex_common_impl.h" +struct sstate_aux { + u32 accept; + u32 accept_eod; + u32 accel; + u32 top; +}; -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 6 -#include "limex_runtime_impl.h" +struct sheng { + m128 shuffle_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 7 -#include "limex_runtime_impl.h" +#endif /* SHENG_INTERNAL_H_ */ diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp new file mode 100644 index 00000000..911f6d70 --- /dev/null +++ b/src/nfa/shengcompile.cpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "shengcompile.h" + +#include "accel.h" +#include "accelcompile.h" +#include "shufticompile.h" +#include "trufflecompile.h" +#include "util/alloc.h" +#include "util/bitutils.h" +#include "util/charreach.h" +#include "util/compare.h" +#include "util/container.h" +#include "util/order_check.h" +#include "util/report_manager.h" +#include "util/unaligned.h" + +#include "grey.h" +#include "nfa_internal.h" +#include "sheng_internal.h" +#include "ue2common.h" +#include "util/compile_context.h" +#include "util/make_unique.h" +#include "util/verify_types.h" +#include "util/simd_utils.h" + +#include +#include +#include + +#include + +using namespace std; +using boost::adaptors::map_keys; + +namespace ue2 { + +#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 + +/** Maximum tolerated number of escape character from an accel state. + * This is larger than nfa, as we don't have a budget and the nfa cheats on stop + * characters for sets of states */ +#define ACCEL_DFA_MAX_STOP_CHAR 160 + +/** Maximum tolerated number of escape character from a sds accel state. Larger + * than normal states as accelerating sds is important. Matches NFA value */ +#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192 + +struct dfa_info { + accel_dfa_build_strat &strat; + raw_dfa &raw; + vector &states; + dstate &floating; + dstate &anchored; + bool can_die; + + explicit dfa_info(accel_dfa_build_strat &s) + : strat(s), raw(strat.get_raw()), states(raw.states), + floating(states[raw.start_floating]), + anchored(states[raw.start_anchored]), can_die(dfaCanDie(raw)) {} + + // returns adjusted size + size_t size() const { + return can_die ? states.size() : states.size() - 1; + } + // expects adjusted index + dstate &operator[](dstate_id_t idx) { + return states[raw_id(idx)]; + } + dstate &top(dstate_id_t idx) { + if (isDead(idx)) { + return floating; + } + return next(idx, TOP); + } + dstate &next(dstate_id_t idx, u16 chr) { + auto &src = (*this)[idx]; + auto next_id = src.next[raw.alpha_remap[chr]]; + return states[next_id]; + } + // get original idx from adjusted idx + dstate_id_t raw_id(dstate_id_t idx) { + assert(idx < size()); + // if DFA can't die, shift all indices left by 1 + return can_die ? 
idx : idx + 1; + } + bool isDead(dstate &state) { + return raw_id(state.impl_id) == DEAD_STATE; + } + bool isDead(dstate_id_t idx) { + return raw_id(idx) == DEAD_STATE; + } + +private: + static bool dfaCanDie(raw_dfa &rdfa) { + for (unsigned chr = 0; chr < 256; chr++) { + for (dstate_id_t state = 0; state < rdfa.states.size(); state++) { + auto succ = rdfa.states[state].next[rdfa.alpha_remap[chr]]; + if (succ == DEAD_STATE) { + return true; + } + } + } + return false; + } +}; + +namespace { + +struct raw_report_list { + flat_set reports; + + raw_report_list(const flat_set &reports_in, + const ReportManager &rm, bool do_remap) { + if (do_remap) { + for (auto &id : reports_in) { + reports.insert(rm.getProgramOffset(id)); + } + } else { + reports = reports_in; + } + } + + bool operator<(const raw_report_list &b) const { + return reports < b.reports; + } +}; + +struct raw_report_info_impl : public raw_report_info { + vector rl; + u32 getReportListSize() const override; + size_t size() const override; + void fillReportLists(NFA *n, size_t base_offset, + std::vector &ro /* out */) const override; +}; +} + +u32 raw_report_info_impl::getReportListSize() const { + u32 rv = 0; + + for (const auto &reps : rl) { + rv += sizeof(report_list); + rv += sizeof(ReportID) * reps.reports.size(); + } + + return rv; +} + +size_t raw_report_info_impl::size() const { + return rl.size(); +} + +void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset, + vector &ro) const { + for (const auto &reps : rl) { + ro.push_back(base_offset); + + report_list *p = (report_list *)((char *)n + base_offset); + + u32 i = 0; + for (const ReportID report : reps.reports) { + p->report[i++] = report; + } + p->count = verify_u32(reps.reports.size()); + + base_offset += sizeof(report_list); + base_offset += sizeof(ReportID) * reps.reports.size(); + } +} + +unique_ptr sheng_build_strat::gatherReports( + vector &reports, + vector &reports_eod, + u8 *isSingleReport, + ReportID *arbReport) const { + DEBUG_PRINTF("gathering reports\n"); + + const bool remap_reports = has_managed_reports(rdfa.kind); + + auto ri = ue2::make_unique(); + map rev; + + for (const dstate &s : rdfa.states) { + if (s.reports.empty()) { + reports.push_back(MO_INVALID_IDX); + continue; + } + + raw_report_list rrl(s.reports, rm, remap_reports); + DEBUG_PRINTF("non empty r\n"); + if (rev.find(rrl) != rev.end()) { + reports.push_back(rev[rrl]); + } else { + DEBUG_PRINTF("adding to rl %zu\n", ri->size()); + rev[rrl] = ri->size(); + reports.push_back(ri->size()); + ri->rl.push_back(rrl); + } + } + + for (const dstate &s : rdfa.states) { + if (s.reports_eod.empty()) { + reports_eod.push_back(MO_INVALID_IDX); + continue; + } + + DEBUG_PRINTF("non empty r eod\n"); + raw_report_list rrl(s.reports_eod, rm, remap_reports); + if (rev.find(rrl) != rev.end()) { + reports_eod.push_back(rev[rrl]); + continue; + } + + DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); + rev[rrl] = ri->size(); + reports_eod.push_back(ri->size()); + ri->rl.push_back(rrl); + } + + assert(!ri->rl.empty()); /* all components should be able to generate + reports */ + if (!ri->rl.empty()) { + *arbReport = *ri->rl.begin()->reports.begin(); + } else { + *arbReport = 0; + } + + /* if we have only a single report id generated from all accepts (not eod) + * we can take some short cuts */ + set reps; + + for (u32 rl_index : reports) { + if (rl_index == MO_INVALID_IDX) { + continue; + } + assert(rl_index < ri->size()); + insert(&reps, ri->rl[rl_index].reports); + } + + if (reps.size() == 1) 
{ + *isSingleReport = 1; + *arbReport = *reps.begin(); + DEBUG_PRINTF("single -- %u\n", *arbReport); + } else { + *isSingleReport = 0; + } + + return move(ri); +} + +u32 sheng_build_strat::max_allowed_offset_accel() const { + return ACCEL_DFA_MAX_OFFSET_DEPTH; +} + +u32 sheng_build_strat::max_stop_char() const { + return ACCEL_DFA_MAX_STOP_CHAR; +} + +u32 sheng_build_strat::max_floating_stop_char() const { + return ACCEL_DFA_MAX_FLOATING_STOP_CHAR; +} + +size_t sheng_build_strat::accelSize() const { + return sizeof(AccelAux); +} + +#ifdef DEBUG +static really_inline +void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) { + stringstream o; + + for (unsigned i = 0; i < sz; i++) { + o.width(2); + o << (buf[i] & SHENG_STATE_MASK) << " "; + } + DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); +} +#endif + +static +void fillAccelOut(const map &accel_escape_info, + set *accel_states) { + for (dstate_id_t i : accel_escape_info | map_keys) { + accel_states->insert(i); + } +} + +static +u8 getShengState(dstate &state, dfa_info &info, + map &accelInfo) { + u8 s = state.impl_id; + if (!state.reports.empty()) { + s |= SHENG_STATE_ACCEPT; + } + if (info.isDead(state)) { + s |= SHENG_STATE_DEAD; + } + if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) { + s |= SHENG_STATE_ACCEL; + } + return s; +} + +static +void fillAccelAux(struct NFA *n, dfa_info &info, + map &accelInfo) { + DEBUG_PRINTF("Filling accel aux structures\n"); + sheng *s = (sheng *)getMutableImplNfa(n); + u32 offset = s->accel_offset; + + for (dstate_id_t i = 0; i < info.size(); i++) { + dstate_id_t state_id = info.raw_id(i); + if (accelInfo.find(state_id) != accelInfo.end()) { + s->flags |= SHENG_FLAG_HAS_ACCEL; + AccelAux *aux = (AccelAux *)((char *)n + offset); + info.strat.buildAccel(state_id, accelInfo[state_id], aux); + sstate_aux *saux = + (sstate_aux *)((char *)n + s->aux_offset) + state_id; + saux->accel = offset; + DEBUG_PRINTF("Accel offset: %u\n", offset); + offset += ROUNDUP_N(sizeof(AccelAux), alignof(AccelAux)); + } + } +} + +static +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, u32 aux_offset, + u32 report_offset, u32 accel_offset, u32 total_size, + u32 dfa_size) { + n->length = total_size; + n->scratchStateSize = 1; + n->streamStateSize = 1; + n->nPositions = info.size(); + n->type = SHENG_NFA_0; + n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0; + + sheng *s = (sheng *)getMutableImplNfa(n); + s->aux_offset = aux_offset; + s->report_offset = report_offset; + s->accel_offset = accel_offset; + s->n_states = info.size(); + s->length = dfa_size; + s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; + + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); +} + +static +void fillTops(NFA *n, dfa_info &info, dstate_id_t id, + map &accelInfo) { + sheng *s = (sheng *)getMutableImplNfa(n); + u32 aux_base = s->aux_offset; + + DEBUG_PRINTF("Filling tops for state %u\n", id); + + sstate_aux *aux = (sstate_aux *)((char *)n + aux_base) + id; + + DEBUG_PRINTF("Aux structure for state %u, offset %zd\n", id, + (char *)aux - (char *)n); + + /* we could conceivably end up in an accept/dead state on a top event, + * so mark top as accept/dead state if it indeed is. 
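+ * the top transition stored in the aux structure is encoded with
+ * getShengState(), so it carries the same SHENG_STATE_ACCEPT/DEAD/ACCEL flag
+ * bits in its upper nibble as an ordinary transition byte.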
+ */ + auto &top_state = info.top(id); + + DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id); + + aux->top = getShengState(top_state, info, accelInfo); +} + +static +void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector &reports, + vector &reports_eod, vector &report_offsets) { + sheng *s = (sheng *)getMutableImplNfa(n); + u32 aux_base = s->aux_offset; + auto raw_id = info.raw_id(id); + + auto &state = info[id]; + + sstate_aux *aux = (sstate_aux *)((char *)n + aux_base) + id; + + DEBUG_PRINTF("Filling aux and report structures for state %u\n", id); + DEBUG_PRINTF("Aux structure for state %u, offset %zd\n", id, + (char *)aux - (char *)n); + + aux->accept = state.reports.empty() ? 0 : report_offsets[reports[raw_id]]; + aux->accept_eod = + state.reports_eod.empty() ? 0 : report_offsets[reports_eod[raw_id]]; + + DEBUG_PRINTF("Report list offset: %u\n", aux->accept); + DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod); +} + +static +void fillSingleReport(NFA *n, ReportID r_id) { + sheng *s = (sheng *)getMutableImplNfa(n); + + DEBUG_PRINTF("Single report ID: %u\n", r_id); + s->report = r_id; + s->flags |= SHENG_FLAG_SINGLE_REPORT; +} + +static +void createShuffleMasks(sheng *s, dfa_info &info, + map &accelInfo) { + for (u16 chr = 0; chr < 256; chr++) { + u8 buf[16] = {0}; + + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + auto &succ_state = info.next(idx, chr); + + buf[idx] = getShengState(succ_state, info, accelInfo); + } +#ifdef DEBUG + dumpShuffleMask(chr, buf, sizeof(buf)); +#endif + m128 mask = loadu128(buf); + s->shuffle_masks[chr] = mask; + } +} + +bool has_accel_sheng(const NFA *nfa) { + const sheng *s = (const sheng *)getImplNfa(nfa); + return s->flags & SHENG_FLAG_HAS_ACCEL; +} + +aligned_unique_ptr shengCompile(raw_dfa &raw, + const CompileContext &cc, + const ReportManager &rm, + set *accel_states) { + if (!cc.grey.allowSheng) { + DEBUG_PRINTF("Sheng is not allowed!\n"); + return nullptr; + } + + sheng_build_strat strat(raw, rm); + dfa_info info(strat); + + DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); + + DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", + raw.start_anchored, raw.start_floating); + + DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", + info.can_die ? 
"can" : "cannot", info.size()); + if (info.size() > 16) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + auto accelInfo = strat.getAccelInfo(cc.grey); + + // set impl_id of each dfa state + for (dstate_id_t i = 0; i < info.size(); i++) { + info[i].impl_id = i; + } + + DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n", + info.anchored.impl_id, info.floating.impl_id); + + u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng)); + vector reports, eod_reports, report_offsets; + u8 isSingle = 0; + ReportID single_report = 0; + + auto ri = + strat.gatherReports(reports, eod_reports, &isSingle, &single_report); + + u32 total_aux = sizeof(sstate_aux) * info.size(); + u32 total_accel = strat.accelSize() * accelInfo.size(); + u32 total_reports = ri->getReportListSize(); + + u32 reports_offset = nfa_size + total_aux; + u32 accel_offset = + ROUNDUP_N(reports_offset + total_reports, alignof(AccelAux)); + u32 total_size = ROUNDUP_N(accel_offset + total_accel, 64); + + DEBUG_PRINTF("NFA: %u, aux: %u, reports: %u, accel: %u, total: %u\n", + nfa_size, total_aux, total_reports, total_accel, total_size); + + aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + + populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset, + accel_offset, total_size, total_size - sizeof(NFA)); + + DEBUG_PRINTF("Setting up aux and report structures\n"); + + ri->fillReportLists(nfa.get(), reports_offset, report_offsets); + + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + fillTops(nfa.get(), info, idx, accelInfo); + fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets); + } + if (isSingle) { + fillSingleReport(nfa.get(), single_report); + } + + fillAccelAux(nfa.get(), info, accelInfo); + + if (accel_states) { + fillAccelOut(accelInfo, accel_states); + } + + createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo); + + return nfa; +} + +} // namespace ue2 diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h new file mode 100644 index 00000000..873b7c75 --- /dev/null +++ b/src/nfa/shengcompile.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
 IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SHENGCOMPILE_H_
+#define SHENGCOMPILE_H_
+
+#include "accel_dfa_build_strat.h"
+#include "rdfa.h"
+#include "util/alloc.h"
+#include "util/charreach.h"
+#include "util/ue2_containers.h"
+
+struct NFA;
+
+namespace ue2 {
+
+class ReportManager;
+struct CompileContext;
+struct raw_dfa;
+
+class sheng_build_strat : public accel_dfa_build_strat {
+public:
+    sheng_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
+        : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {}
+    raw_dfa &get_raw() const override { return rdfa; }
+    std::unique_ptr<raw_report_info> gatherReports(
+        std::vector<u32> &reports /* out */,
+        std::vector<u32> &reports_eod /* out */,
+        u8 *isSingleReport /* out */,
+        ReportID *arbReport /* out */) const override;
+    size_t accelSize(void) const override;
+    u32 max_allowed_offset_accel() const override;
+    u32 max_stop_char() const override;
+    u32 max_floating_stop_char() const override;
+
+private:
+    raw_dfa &rdfa;
+};
+
+aligned_unique_ptr<NFA>
+shengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm,
+             std::set<dstate_id_t> *accel_states = nullptr);
+
+struct sheng_escape_info {
+    CharReach outs;
+    CharReach outs2_single;
+    flat_set<std::pair<u8, u8>> outs2;
+    bool outs2_broken = false;
+};
+
+bool has_accel_sheng(const NFA *nfa);
+
+} // namespace ue2
+
+#endif /* SHENGCOMPILE_H_ */
diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp
new file mode 100644
index 00000000..037dfb05
--- /dev/null
+++ b/src/nfa/shengdump.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "shengdump.h" + +#include "accel_dump.h" +#include "nfa_dump_internal.h" +#include "nfa_internal.h" +#include "sheng_internal.h" +#include "rdfa.h" +#include "ue2common.h" +#include "util/charreach.h" +#include "util/dump_charclass.h" +#include "util/simd_utils.h" + + +#ifndef DUMP_SUPPORT +#error No dump support! +#endif + +using namespace std; + +namespace ue2 { + +static +const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { + assert(n && isShengType(n->type)); + + const sheng *s = (const sheng *)getImplNfa(n); + const sstate_aux *aux_base = + (const sstate_aux *)((const char *)n + s->aux_offset); + + const sstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)s + s->length); + + return aux; +} + +static +void dumpHeader(FILE *f, const sheng *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG_STATE_MASK, s->floating & SHENG_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + +static +void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG_STATE_MASK); +} + +static +void dumpReports(FILE *f, const report_list *rl) { + fprintf(f, "reports count: %u\n", rl->count); + for (u32 i = 0; i < rl->count; i++) { + fprintf(f, " report: %u, report ID: %u\n", i, rl->report[i]); + } +} + +static +void dumpMasks(FILE *f, const sheng *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[16]; + m128 shuffle_mask = s->shuffle_masks[chr]; + store128(buf, shuffle_mask); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 16; pos++) { + u8 c = buf[pos]; + if (c & SHENG_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + +void nfaExecSheng0_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_0); + const sheng *s = (const sheng *)getImplNfa(nfa); + + fprintf(f, "sheng DFA\n"); + dumpHeader(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux(nfa, state); + dumpAux(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + 
const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks(f, s); + + fprintf(f, "\n"); +} + +static +void dumpDotPreambleDfa(FILE *f) { + dumpDotPreamble(f); + + // DFA specific additions. + fprintf(f, "STARTF [style=invis];\n"); + fprintf(f, "STARTA [style=invis];\n"); + fprintf(f, "0 [style=invis];\n"); +} + +static +void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG_STATE_MASK); + } + + if (i == (s->anchored & SHENG_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + +static +void describeEdge(FILE *f, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ label = \"", i, t[s]); + + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} + +static +void shengGetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isShengType(n->type)); + const sheng *s = (const sheng *)getImplNfa(n); + const sstate_aux *aux = get_aux(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[16]; + m128 shuffle_mask = s->shuffle_masks[i]; + + store128(buf, shuffle_mask); + + t[i] = buf[state] & SHENG_STATE_MASK; + } + + t[TOP] = aux->top & SHENG_STATE_MASK; +} + +void nfaExecSheng0_dumpDot(const NFA *nfa, FILE *f, const string &) { + assert(nfa->type == SHENG_NFA_0); + const sheng *s = (const sheng *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + shengGetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + +} // namespace ue2 diff --git a/src/util/simd_utils_ssse3.c b/src/nfa/shengdump.h similarity index 79% rename from src/util/simd_utils_ssse3.c rename to src/nfa/shengdump.h index 50cbe007..5334894f 100644 --- a/src/util/simd_utils_ssse3.c +++ b/src/nfa/shengdump.h @@ -26,15 +26,24 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "simd_utils_ssse3.h" +#ifndef SHENGDUMP_H_ +#define SHENGDUMP_H_ -const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, +#ifdef DUMP_SUPPORT - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, +#include +#include - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, -}; +struct NFA; + +namespace ue2 { + +void nfaExecSheng0_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecSheng0_dumpText(const struct NFA *nfa, FILE *file); + +} // namespace ue2 + +#endif // DUMP_SUPPORT + +#endif /* SHENGDUMP_H_ */ diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index b1fec488..903e04da 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,8 +40,6 @@ #include "shufti_common.h" -#include "util/simd_utils_ssse3.h" - /** \brief Naive byte-by-byte implementation. */ static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, @@ -235,7 +233,7 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, m128 c2_lo = pshufb(mask2_lo, chars_lo); m128 c2_hi = pshufb(mask2_hi, chars_hi); - m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi))); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); @@ -472,7 +470,7 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, m256 c2_lo = vpshufb(mask2_lo, chars_lo); m256 c2_hi = vpshufb(mask2_hi, chars_hi); - m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi))); + m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); #ifdef DEBUG DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h index 9c11f2b9..e63ad27a 100644 --- a/src/nfa/shufti_common.h +++ b/src/nfa/shufti_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ #include "util/bitutils.h" #include "util/simd_utils.h" #include "util/unaligned.h" -#include "util/simd_utils_ssse3.h" /* * Common stuff for all versions of shufti (single, multi and multidouble) @@ -94,7 +93,7 @@ DUMP_MSK(128) #endif #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, @@ -120,7 +119,7 @@ DUMP_MSK(256) #endif #define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c new file mode 100644 index 00000000..b5f90e85 --- /dev/null +++ b/src/nfa/tamarama.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2016, Intel 
Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Tamarama: container engine for exclusive engines, runtime code. +*/ +#include "config.h" + +#include "tamarama.h" + +#include "tamarama_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_api_util.h" +#include "nfa_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +static really_inline +u32 getSubOffset(const struct Tamarama *t, u32 num) { + DEBUG_PRINTF("subengine:%u\n", num); + assert(num < t->numSubEngines); + const u32 *sub = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + assert(ISALIGNED(sub)); + return sub[num]; +} + +static +const struct NFA *getSubEngine(const struct Tamarama *t, + const u32 activeIdx) { + const u32 offset = getSubOffset(t, activeIdx); + DEBUG_PRINTF("activeIdx:%u offsets:%u\n", activeIdx, offset); + const char *base = (const char *)t; + return (const struct NFA *)(base + offset); +} + +static +void storeActiveIdx(const struct Tamarama *t, char *state, + const u32 idx) { + assert(idx <= t->numSubEngines); + partial_store_u32(state, idx, t->activeIdxSize); +} + +static +u32 loadActiveIdx(const char *state, + const u32 activeIdxSize) { + return partial_load_u32(state, activeIdxSize); +} + +static really_inline +void copyQueueProperties(const struct mq *q1, struct mq *q2, + const u32 activeIdxSize) { + q2->state = q1->state; + q2->streamState = q1->streamState + activeIdxSize; + q2->offset = q1->offset; + q2->buffer = q1->buffer; + q2->length = q1->length; + q2->history = q1->history; + q2->hlength = q1->hlength; + q2->cb = q1->cb; + q2->context = q1->context; + q2->scratch = q1->scratch; + q2->report_current = q1->report_current; +} + +static +void copyQueueItems(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + + u32 lower = baseTop[activeIdx]; + u32 upper = activeIdx == t->numSubEngines - 1 ? 
+ ~0U : baseTop[activeIdx + 1]; + u32 event_base = isMultiTopType(sub->type) ? MQE_TOP_FIRST : MQE_TOP; + while (q1->cur < q1->end) { + u32 type = q1->items[q1->cur].type; + s64a loc = q1->items[q1->cur].location; + DEBUG_PRINTF("type:%u lower:%u upper:%u\n", type, lower, upper); + if (type >= lower && type < upper) { + u32 event = event_base; + if (event == MQE_TOP_FIRST) { + event += type - lower; + } + pushQueue(q2, event, loc); + } else { + pushQueueNoMerge(q2, MQE_END, loc); + break; + } + q1->cur++; + } +} + +static +void copyQueue(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + copyQueueProperties(q1, q2, t->activeIdxSize); + + // copy MQE_START item + u32 cur = q1->cur++; + q2->cur = cur; + q2->items[cur] = q1->items[cur]; + q2->end = cur + 1; + + copyQueueItems(t, sub, q1, q2, activeIdx); + // restore cur index of the main queue + q1->cur = cur; +} + +static +u32 findEngineForTop(const u32 *baseTop, const u32 cur, + const u32 numSubEngines) { + u32 i; + for (i = 0; i < numSubEngines; ++i) { + DEBUG_PRINTF("cur:%u base:%u\n", cur, baseTop[i]); + if (cur >= baseTop[i] && + (i == numSubEngines - 1 || cur < baseTop[i + 1])) { + break; + } + } + return i; +} + +static +void initSubQueue(const struct Tamarama *t, struct mq *q1, + struct mq *q2, const u32 lastActiveIdx, + const u32 activeIdx) { + // Push events to the new queue + const struct NFA *sub = getSubEngine(t, activeIdx); + assert(!isContainerType(sub->type)); + q2->nfa = sub; + + // Reinitialize state if the last active subengine is different + // from current one + if (lastActiveIdx == t->numSubEngines || + lastActiveIdx != activeIdx) { + nfaQueueInitState(q2->nfa, q2); + } + + copyQueueItems(t, sub, q1, q2, activeIdx); + if (q1->items[q1->cur].type == MQE_END) { + q1->cur++; + } + DEBUG_PRINTF("update lastIdx:%u\n", activeIdx); + storeActiveIdx(t, q1->streamState, activeIdx); +} + +static +void updateQueues(const struct Tamarama *t, struct mq *q1, struct mq *q2) { + q2->cur = q2->end = 0; + copyQueueProperties(q1, q2, t->activeIdxSize); + + const u32 numSubEngines = t->numSubEngines; + u32 lastActiveIdx = loadActiveIdx(q1->streamState, + t->activeIdxSize); +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q1); +#endif + + // Push MQE_START event to the subqueue + s64a loc = q1->items[q1->cur].location; + pushQueueAt(q2, 0, MQE_START, loc); + char hasStart = 0; + if (q1->items[q1->cur].type == MQE_START) { + hasStart = 1; + q1->cur++; + } + + u32 activeIdx = lastActiveIdx; + // If we have top events in the main queue, update current active id + if (q1->cur < q1->end - 1) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + u32 curTop = q1->items[q1->cur].type; + activeIdx = findEngineForTop(baseTop, curTop, numSubEngines); + } + + assert(activeIdx < numSubEngines); + DEBUG_PRINTF("last id:%u, current id:%u, num of subengines:%u\n", + lastActiveIdx, activeIdx, numSubEngines); + // Handle unfinished last alive subengine + if (lastActiveIdx != activeIdx && + lastActiveIdx != numSubEngines && hasStart) { + loc = q1->items[q1->cur].location; + pushQueueNoMerge(q2, MQE_END, loc); + q2->nfa = getSubEngine(t, lastActiveIdx); + return; + } + + initSubQueue(t, q1, q2, lastActiveIdx, activeIdx); + DEBUG_PRINTF("finish queues\n"); +} + +// After processing subqueue items for subengines, we need to copy back +// remaining items in subqueue if there are any to Tamarama main queue +static +void copyBack(const struct Tamarama *t, struct mq 
*q, struct mq *q1) { + DEBUG_PRINTF("copy back %u, %u\n", q1->cur, q1->end); + q->report_current = q1->report_current; + if (q->cur >= q->end && q1->cur >= q1->end) { + return; + } + + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + const u32 lastIdx = loadActiveIdx(q->streamState, + t->activeIdxSize); + u32 base = 0, event_base = 0; + if (lastIdx != t->numSubEngines) { + base = baseTop[lastIdx]; + const struct NFA *sub = getSubEngine(t, lastIdx); + event_base = isMultiTopType(sub->type) ? MQE_TOP_FIRST : MQE_TOP; + } + + u32 numItems = q1->end > q1->cur + 1 ? q1->end - q1->cur - 1 : 1; + // Also need to copy MQE_END if the main queue is empty + if (q->cur == q->end) { + assert(q->cur > 1 && q1->items[q1->end - 1].type == MQE_END); + q->items[--q->cur] = q1->items[q1->end - 1]; + } + u32 cur = q->cur - numItems; + q->items[cur] = q1->items[q1->cur++]; + q->items[cur].type = MQE_START; + q->cur = cur++; + for (u32 i = 0; i < numItems - 1; ++i) { + assert(q1->cur < q1->end); + u32 type = q1->items[q1->cur].type; + if (type > MQE_END) { + q1->items[q1->cur].type = type - event_base + base; + } + q->items[cur++] = q1->items[q1->cur++]; + } + +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q); +#endif +} + +char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return MO_CONTINUE_MATCHING; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + if (nfaAcceptsEod(sub)) { + assert(!isContainerType(sub->type)); + const char *subStreamState = streamState + t->activeIdxSize; + return nfaCheckFinalState(sub, state, subStreamState, offset, callback, + context); + } + + return MO_CONTINUE_MATCHING; +} + +char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, + ReportID report) { + DEBUG_PRINTF("exec rose\n"); + struct mq q1; + q1.cur = q1.end = 0; + char rv = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end) { + updateQueues(t, q, &q1); + } + + if (q1.cur < q1.end) { + rv = nfaQueueExecRose(q1.nfa, &q1, report); + } + + DEBUG_PRINTF("exec rose rv:%u\n", rv); + return rv; +} + +char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 1; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaReportCurrentMatches(sub, &q1); +} + +char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAcceptState(sub, report, &q1); +} + +char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAnyAcceptState(sub, &q1); +} + +char 
nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q) { + DEBUG_PRINTF("init state\n"); + const struct Tamarama *t = getImplNfa(n); + char *ptr = q->streamState; + // Use activeIdxSize as a sentinel value and initialize the state to + // an invalid engine as nothing has been triggered yet + storeActiveIdx(t, ptr, t->numSubEngines); + return 0; +} + +char nfaExecTamarama0_queueCompressState(const struct NFA *n, + const struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueueProperties(q, &q1, t->activeIdxSize); + return nfaQueueCompressState(sub, &q1, loc); +} + +char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(src, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + const char *subStreamState = (const char *)src + t->activeIdxSize; + return nfaExpandState(sub, dest, subStreamState, offset, key); +} + +enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, + struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return NFA_ZOMBIE_NO; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaGetZombieStatus(sub, &q1, loc); +} + +char nfaExecTamarama0_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec\n"); + struct mq q1; + char rv = MO_ALIVE; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end) { + updateQueues(t, q, &q1); + rv = nfaQueueExec_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + +char nfaExecTamarama0_Q2(const struct NFA *n, + struct mq *q, s64a end) { + DEBUG_PRINTF("exec to match\n"); + struct mq q1; + char rv = 0; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end && + rv != MO_MATCHES_PENDING) { + updateQueues(t, q, &q1); + rv = nfaQueueExec2_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + diff --git a/src/nfa/tamarama.h b/src/nfa/tamarama.h new file mode 100644 index 00000000..7ccfa5a0 --- /dev/null +++ b/src/nfa/tamarama.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TAMARAMA_H +#define TAMARAMA_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; +struct hs_scratch; + +char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_queueCompressState(const struct NFA *n, + const struct mq *q, + s64a loc); +char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key); +enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, + struct mq *q, s64a loc); +char nfaExecTamarama0_Q(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama0_Q2(const struct NFA *nfa, struct mq *q, s64a end); + +// only used by outfix and miracles, no implementation for tamarama +#define nfaExecTamarama0_initCompressedState NFA_API_NO_IMPL +#define nfaExecTamarama0_B_Reverse NFA_API_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/nfa/tamarama_dump.cpp b/src/nfa/tamarama_dump.cpp new file mode 100644 index 00000000..181fa9af --- /dev/null +++ b/src/nfa/tamarama_dump.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Tamarama: container engine for exclusve engines, dump code. + */ + +#include "config.h" + +#include "tamarama_dump.h" + +#include "tamarama_internal.h" +#include "nfa_dump_api.h" +#include "nfa_dump_internal.h" +#include "nfa_internal.h" + +#include +#include + +#ifndef DUMP_SUPPORT +#error No dump support! +#endif + +namespace ue2 { + +void nfaExecTamarama0_dumpDot(const struct NFA *nfa, UNUSED FILE *f, + const std::string &base) { + const Tamarama *t = (const Tamarama *)getImplNfa(nfa); + const u32 *subOffset = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + for (u32 i = 0; i < t->numSubEngines; i++) { + std::stringstream ssdot; + ssdot << base << "rose_nfa_" << nfa->queueIndex + << "_sub_" << i << ".dot"; + const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); + FILE *f1 = fopen(ssdot.str().c_str(), "w"); + nfaDumpDot(sub, f1, base); + fclose(f1); + } +} + +void nfaExecTamarama0_dumpText(const struct NFA *nfa, FILE *f) { + const Tamarama *t = (const Tamarama *)getImplNfa(nfa); + + fprintf(f, "Tamarama container engine\n"); + fprintf(f, "\n"); + fprintf(f, "Number of subengine tenants: %u\n", t->numSubEngines); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); + fprintf(f, "\n"); + + const u32 *subOffset = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + for (u32 i = 0; i < t->numSubEngines; i++) { + fprintf(f, "Sub %u:\n", i); + const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); + nfaDumpText(sub, f); + fprintf(f, "\n"); + } +} + +} // namespace ue2 diff --git a/src/nfa/mcclellancompile_accel.h b/src/nfa/tamarama_dump.h similarity index 61% rename from src/nfa/mcclellancompile_accel.h rename to src/nfa/tamarama_dump.h index 427267d7..dc976004 100644 --- a/src/nfa/mcclellancompile_accel.h +++ b/src/nfa/tamarama_dump.h @@ -26,36 +26,24 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef MCCLELLANCOMPILE_ACCEL_H -#define MCCLELLANCOMPILE_ACCEL_H +#ifndef TAMARAMA_DUMP_H +#define TAMARAMA_DUMP_H -#include "mcclellancompile.h" +#if defined(DUMP_SUPPORT) -#include +#include +#include + +struct NFA; namespace ue2 { -struct Grey; +void nfaExecTamarama0_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); +void nfaExecTamarama0_dumpText(const NFA *nfa, FILE *file); -#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 +} // namespace ue2 -/** Maximum tolerated number of escape character from an accel state. - * This is larger than nfa, as we don't have a budget and the nfa cheats on stop - * characters for sets of states */ -#define ACCEL_DFA_MAX_STOP_CHAR 160 - -/** Maximum tolerated number of escape character from a sds accel state. 
Larger - * than normal states as accelerating sds is important. Matches NFA value */ -#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192 - -std::map populateAccelerationInfo(const raw_dfa &rdfa, - const dfa_build_strat &strat, - const Grey &grey); - -AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, - dstate_id_t this_idx, - u32 max_allowed_accel_offset); - -} +#endif // DUMP_SUPPORT #endif diff --git a/src/nfa/tamarama_internal.h b/src/nfa/tamarama_internal.h new file mode 100644 index 00000000..5cdc70d4 --- /dev/null +++ b/src/nfa/tamarama_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + *\brief Tamarama: container engine for exclusive engines, + * data structures. + */ + +/* Tamarama bytecode layout: + * * |-----| + * * | | struct NFA + * * |-----| + * * | | struct Tamarama + * * | | + * * |-----| + * * | | top remapping table: + * * | | stores top base for each subengine. + * * | | old_top = remapped_top - top_base; + * * | | The size of table is equal to the number of subengines. + * * ... + * * | | + * * |-----| + * * | | offsets from the start of struct Tamarama to subengines --\ + * * ... | + * * | | -----------\ | + * * |-----| | | + * * ||--| | subengine 1 (struct NFA + rest of subengine) <--/ | + * * || | | | + * * ||--| | | + * * || | | | + * * || | | | + * * ||--| | | + * * | | | + * * ||--| | subengine 2 (struct NFA + rest of subengine) <-------/ + * * || | | + * * ||--| | + * * || | | + * * || | | + * * ||--| | + * * | | + * * ... + * * | | + * * |-----| total size of tamarama + * * + * * Tamarama stream state: + * * + * * |---| + * * | | active subengine id + * * |---| + * * | | common pool of stream state for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * * + * * Tamarama scratch space: + * * + * * |---| + * * | | common pool of scratch for each engine + * * | | + * * | | + * * ... 
+ * * | | + * * | | + * * |---| + * */ + +#ifndef NFA_TAMARAMA_INTERNAL_H +#define NFA_TAMARAMA_INTERNAL_H + +#include "ue2common.h" + +struct ALIGN_AVX_DIRECTIVE Tamarama { + u32 numSubEngines; + u8 activeIdxSize; +}; + +#endif // NFA_TAMARAMA_INTERNAL_H diff --git a/src/nfa/tamaramacompile.cpp b/src/nfa/tamaramacompile.cpp new file mode 100644 index 00000000..73d19595 --- /dev/null +++ b/src/nfa/tamaramacompile.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Tamarama: container engine for exclusive engines, + * compiler code. + */ + +#include "config.h" + +#include "tamaramacompile.h" + +#include "tamarama_internal.h" +#include "nfa_internal.h" +#include "nfa_api_queue.h" +#include "repeatcompile.h" +#include "util/container.h" +#include "util/verify_types.h" + +using namespace std; + +namespace ue2 { + +static +void remapTops(const TamaInfo &tamaInfo, + vector &top_base, + map, u32> &out_top_remap) { + u32 i = 0; + u32 cur = 0; + for (const auto &sub : tamaInfo.subengines) { + u32 base = cur; + top_base.push_back(base + MQE_TOP_FIRST); + DEBUG_PRINTF("subengine:%u\n", i); + for (const auto &t : tamaInfo.tops[i++]) { + cur = base + t; + DEBUG_PRINTF("top remapping %u:%u\n", t ,cur); + out_top_remap.emplace(make_pair(sub, t), cur++); + } + } +} + +/** + * update stream state and scratch state sizes and copy in + * subengines in Tamarama. 
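remapTops() above hands each subengine a private band of top values: the running base is recorded per subengine (offset by MQE_TOP_FIRST in the real code) and each original top becomes base + top, so the runtime can recover the original as remapped_top - top_base, as the layout comment earlier describes. A standalone sketch of the same arithmetic with the queue-event constant left out, so only the mapping itself is shown:

    // Illustration of the remapping performed by remapTops(); not Hyperscan code.
    #include <cstdio>
    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    int main() {
        // two hypothetical subengines and the tops each of them accepts
        std::vector<std::set<unsigned>> tops = {{0, 2}, {1}};
        std::vector<unsigned> top_base;                          // per-subengine base
        std::map<std::pair<unsigned, unsigned>, unsigned> remap; // (sub, top) -> new top

        unsigned cur = 0;
        for (unsigned sub = 0; sub < tops.size(); sub++) {
            unsigned base = cur;
            top_base.push_back(base);      // the real code stores base + MQE_TOP_FIRST
            for (unsigned t : tops[sub]) {
                cur = base + t;
                remap[{sub, t}] = cur++;
            }
        }
        // sub 0: top 0 -> 0, top 2 -> 2; sub 1 (base 3): top 1 -> 4
        for (const auto &m : remap) {
            std::printf("sub %u top %u -> %u (recovered: %u)\n", m.first.first,
                        m.first.second, m.second,
                        m.second - top_base[m.first.first]);
        }
        return 0;
    }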
+ */ +static +void copyInSubnfas(const char *base_offset, NFA &nfa, + const TamaInfo &tamaInfo, u32 *offsets, + char *sub_nfa_offset, const u32 activeIdxSize) { + u32 maxStreamStateSize = 0; + u32 maxScratchStateSize = 0; + sub_nfa_offset = ROUNDUP_PTR(sub_nfa_offset, 64); + bool infinite_max_width = false; + for (auto &sub : tamaInfo.subengines) { + u32 streamStateSize = verify_u32(sub->streamStateSize); + u32 scratchStateSize = verify_u32(sub->scratchStateSize); + maxStreamStateSize = max(maxStreamStateSize, streamStateSize); + maxScratchStateSize = max(maxScratchStateSize, scratchStateSize); + sub->queueIndex = nfa.queueIndex; + + memcpy(sub_nfa_offset, sub, sub->length); + *offsets = verify_u32(sub_nfa_offset - base_offset); + DEBUG_PRINTF("type:%u offsets:%u\n", sub->type, *offsets); + ++offsets; + sub_nfa_offset += ROUNDUP_CL(sub->length); + + // update nfa properties + nfa.flags |= sub->flags; + if (!sub->maxWidth) { + infinite_max_width = true; + } else if (!infinite_max_width) { + nfa.maxWidth = max(nfa.maxWidth, sub->maxWidth); + } + } + + if (infinite_max_width) { + nfa.maxWidth = 0; + } + nfa.maxBiAnchoredWidth = 0; + nfa.streamStateSize = activeIdxSize + maxStreamStateSize; + nfa.scratchStateSize = maxScratchStateSize; +} + +/** + * Take in a collection of exclusive sub engines and produces a tamarama, also + * returns via out_top_remap, a mapping indicating how tops in the subengines in + * relate to the tamarama's tops. + */ +aligned_unique_ptr buildTamarama(const TamaInfo &tamaInfo, const u32 queue, + map, u32> &out_top_remap) { + vector top_base; + remapTops(tamaInfo, top_base, out_top_remap); + + size_t subSize = tamaInfo.subengines.size(); + DEBUG_PRINTF("subSize:%lu\n", subSize); + size_t total_size = + sizeof(NFA) + // initial NFA structure + sizeof(Tamarama) + // Tamarama structure + sizeof(u32) * subSize + // base top event value for subengines, + // used for top remapping at runtime + sizeof(u32) * subSize + 64; // offsets to subengines in bytecode and + // padding for subengines + + for (const auto &sub : tamaInfo.subengines) { + total_size += ROUNDUP_CL(sub->length); + } + + // use subSize as a sentinel value for no active subengines, + // so add one to subSize here + u32 activeIdxSize = calcPackedBytes(subSize + 1); + aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + nfa->type = verify_u8(TAMARAMA_NFA_0); + nfa->length = verify_u32(total_size); + nfa->queueIndex = queue; + + char *ptr = (char *)nfa.get() + sizeof(NFA); + char *base_offset = ptr; + Tamarama *t = (Tamarama *)ptr; + t->numSubEngines = verify_u32(subSize); + t->activeIdxSize = verify_u8(activeIdxSize); + + ptr += sizeof(Tamarama); + copy_bytes(ptr, top_base); + ptr += byte_length(top_base); + + u32 *offsets = (u32*)ptr; + char *sub_nfa_offset = ptr + sizeof(u32) * subSize; + copyInSubnfas(base_offset, *nfa, tamaInfo, offsets, sub_nfa_offset, + activeIdxSize); + assert((size_t)(sub_nfa_offset - (char *)nfa.get()) <= total_size); + return nfa; +} + +set all_reports(const TamaProto &proto) { + return proto.reports; +} + +void TamaInfo::add(NFA *sub, const set &top) { + assert(subengines.size() < max_occupancy); + subengines.push_back(sub); + tops.push_back(top); +} + +void TamaProto::add(const NFA *n, const u32 id, const u32 top, + const map, u32> &out_top_remap) { + top_remap.emplace(make_pair(id, top), out_top_remap.at(make_pair(n, top))); +} + +} // namespace ue2 + diff --git a/src/nfa/tamaramacompile.h b/src/nfa/tamaramacompile.h new file mode 100644 index 00000000..048b966b --- 
/dev/null +++ b/src/nfa/tamaramacompile.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Tamarama: container engine for exclusive engines, compiler code. + */ + +#ifndef NFA_TAMARAMACOMPILE_H +#define NFA_TAMARAMACOMPILE_H + +#include "ue2common.h" +#include "util/alloc.h" + +#include +#include +#include + +struct NFA; + +namespace ue2 { + +/** + * \brief A TamaProto that contains top remapping and reports info + */ +struct TamaProto { + void add(const NFA *n, const u32 id, const u32 top, + const std::map, u32> &out_top_remap); + /** Top remapping between and + ** remapped top value. */ + std::map, u32> top_remap; + + /** All the reports in subengines */ + std::set reports; +}; + +/** + * \brief Contruction info for a Tamarama engine: + * contains at least two subengines. + * + * A TamaInfo is converted into a single NFA, with each top triggering a + * subengine. A TamaInfo can contain at most TamaInfo::max_occupancy + * subengines. + */ +struct TamaInfo { + static constexpr size_t max_occupancy = 65536; // arbitrary limit + + /** \brief Add a new subengine. */ + void add(NFA* sub, const std::set &top); + + /** \brief All the subengines */ + std::vector subengines; + + /** \brief Tops of subengines */ + std::vector> tops; +}; + +std::set all_reports(const TamaProto &proto); + +/** + * Take in a collection of exclusive subengines and produces a tamarama, also + * returns via out_top_remap, a mapping indicating how tops in the subengines in + * relate to the tamarama's tops. 
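Read together with the runtime and compiler sources above, the intended compile-side flow appears to be: collect the provably exclusive subengines and their tops in a TamaInfo, call buildTamarama() to obtain the single container NFA plus the top remapping, then record that remapping per pattern through TamaProto::add(). A rough usage sketch follows; the template parameters stripped from the flattened text above are inferred here (tops as u32, the remap keyed by (subengine, top)), and subNfa0, subNfa1, queue and patternId are placeholder names, so treat the details as assumptions rather than the actual Rose integration:

    // Hypothetical usage of the TamaInfo/buildTamarama/TamaProto trio.
    static
    TamaProto buildExclusiveContainer(NFA *subNfa0, NFA *subNfa1, u32 queue,
                                      u32 patternId) {
        TamaInfo info;
        info.add(subNfa0, {0u, 2u});   // subengine 0 is triggered by tops 0 and 2
        info.add(subNfa1, {1u});       // subengine 1 is triggered by top 1

        std::map<std::pair<const NFA *, u32>, u32> top_remap;
        auto container = buildTamarama(info, queue, top_remap);
        // ... 'container' would be installed as the single engine for this queue ...

        TamaProto proto;
        // re-point (pattern id, original top) at the container's remapped top
        proto.add(subNfa0, patternId, 0u, top_remap);
        return proto;
    }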
+ */ +ue2::aligned_unique_ptr buildTamarama(const TamaInfo &tamaInfo, + const u32 queue, + std::map, u32> &out_top_remap); +} // namespace ue2 + +#endif // NFA_TAMARAMACOMPILE_H diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index 8863c71a..1eff269a 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,6 @@ #include "truffle.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include "truffle_common.h" diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h index 122f65c4..7368e550 100644 --- a/src/nfa/truffle_common.h +++ b/src/nfa/truffle_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /* * Common stuff for all versions of truffle (single, multi and multidouble) @@ -49,7 +48,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) { return NULL; // no match } -#define shift128r(a, b) _mm_srli_epi64((a), (b)) static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { @@ -60,7 +58,7 @@ u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { m128 shuf1 = pshufb(shuf_mask_lo_highclear, v); m128 t1 = xor128(v, highconst); m128 shuf2 = pshufb(shuf_mask_lo_highset, t1); - m128 t2 = andnot128(highconst, shift128r(v, 4)); + m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); m128 shuf3 = pshufb(shuf_mask_hi, t2); m128 tmp = and128(or128(shuf1, shuf2), shuf3); m128 tmp2 = eq128(tmp, zeroes128()); @@ -103,7 +101,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) { return NULL; // no match } -#define shift256r(a, b) _mm256_srli_epi64((a), (b)) static really_inline u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { @@ -114,7 +111,7 @@ u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v); m256 t1 = xor256(v, highconst); m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1); - m256 t2 = andnot256(highconst, shift256r(v, 4)); + m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); m256 shuf3 = vpshufb(shuf_mask_hi, t2); m256 tmp = and256(or256(shuf1, shuf2), shuf3); m256 tmp2 = eq256(tmp, zeroes256()); diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 1883a44c..0749470f 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -138,7 +138,7 @@ const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); u32 z = movemask128(and128(eq128(chars1, data), - shiftRight8Bits(eq128(chars2, data)))); + rshiftbyte_m128(eq128(chars2, data), 1))); if (buf[15] == c1 && buf[16] == c2) { z |= (1 << 15); } @@ -161,7 +161,7 @@ const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, m128 data = load128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), - shiftRight8Bits(eq128(chars2, v)))); + rshiftbyte_m128(eq128(chars2, v), 1))); if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & 
CASE_CLEAR) == c2) { z |= (1 << 15); } @@ -182,8 +182,10 @@ const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)), - shiftRight8Bits(eq128(chars2, and128(data, mask2))))); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { z |= (1 << 15); } @@ -201,7 +203,7 @@ static really_inline const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); // unaligned u32 z = movemask128(and128(eq128(chars1, data), - shiftRight8Bits(eq128(chars2, data)))); + rshiftbyte_m128(eq128(chars2, data), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -219,7 +221,7 @@ const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); // unaligned m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), - shiftRight8Bits(eq128(chars2, v)))); + rshiftbyte_m128(eq128(chars2, v), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -234,8 +236,9 @@ static really_inline const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, m128 mask1, m128 mask2, const u8 *buf) { m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)), - shiftRight8Bits(eq128(chars2, and128(data, mask2))))); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -324,7 +327,7 @@ const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, for (; buf + 16 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); u32 z = movemask128(and128(eq128(chars2, data), - shiftLeft8Bits(eq128(chars1, data)))); + lshiftbyte_m128(eq128(chars1, data), 1))); if (buf_end[-17] == c1 && buf_end[-16] == c2) { z |= 1; } @@ -345,7 +348,7 @@ const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, m128 data = load128(buf_end - 16); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), - shiftLeft8Bits(eq128(chars1, v)))); + lshiftbyte_m128(eq128(chars1, v), 1))); if ((buf_end[-17] & CASE_CLEAR) == c1 && (buf_end[-16] & CASE_CLEAR) == c2) { z |= 1; @@ -362,7 +365,7 @@ static really_inline const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); u32 z = movemask128(and128(eq128(chars2, data), - shiftLeft8Bits(eq128(chars1, data)))); + lshiftbyte_m128(eq128(chars1, data), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -380,7 +383,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), - shiftLeft8Bits(eq128(chars1, v)))); + lshiftbyte_m128(eq128(chars1, v), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { return lastMatchOffset(buf + 16, z); diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index b4b34d74..deca3fd5 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -57,13 +57,14 @@ #include 
"ng_small_literal_set.h" #include "ng_som.h" #include "ng_vacuous.h" +#include "ng_violet.h" #include "ng_utf8.h" #include "ng_util.h" #include "ng_width.h" #include "ue2common.h" #include "nfa/goughcompile.h" -#include "smallwrite/smallwrite_build.h" #include "rose/rose_build.h" +#include "smallwrite/smallwrite_build.h" #include "util/compile_error.h" #include "util/container.h" #include "util/depth.h" @@ -75,14 +76,15 @@ using namespace std; namespace ue2 { -NG::NG(const CompileContext &in_cc, unsigned in_somPrecision) +NG::NG(const CompileContext &in_cc, size_t num_patterns, + unsigned in_somPrecision) : maxSomRevHistoryAvailable(in_cc.grey.somMaxRevNfaLength), minWidth(depth::infinity()), rm(in_cc.grey), ssm(in_somPrecision), cc(in_cc), - rose(makeRoseBuilder(rm, ssm, cc, boundary)), - smwr(makeSmallWriteBuilder(rm, cc)) { + smwr(makeSmallWriteBuilder(num_patterns, rm, cc)), + rose(makeRoseBuilder(rm, ssm, *smwr, cc, boundary)) { } NG::~NG() { @@ -103,6 +105,7 @@ bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w, DEBUG_PRINTF("doing som\n"); dumpComponent(g, "03_presom", w.expressionIndex, comp_id, ng.cc.grey); assert(hasCorrectlyNumberedVertices(g)); + assert(allMatchStatesHaveReports(w)); // First, we try the "SOM chain" support in ng_som.cpp. @@ -206,6 +209,8 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, dumpComponent(g, "01_begin", w.expressionIndex, comp_id, ng.cc.grey); + assert(allMatchStatesHaveReports(w)); + reduceGraph(g, som, w.utf8, cc); dumpComponent(g, "02_reduced", w.expressionIndex, comp_id, ng.cc.grey); @@ -230,6 +235,8 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, } } + assert(allMatchStatesHaveReports(w)); + if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) { return true; } @@ -243,6 +250,10 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } + if (doViolet(*ng.rose, g, w.prefilter, cc)) { + return true; + } + if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { return true; } @@ -260,6 +271,10 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } + if (doViolet(*ng.rose, g, w.prefilter, cc)) { + return true; + } + if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { return true; } @@ -579,7 +594,8 @@ bool NG::addLiteral(const ue2_literal &literal, u32 expr_index, minWidth = min(minWidth, depth(literal.length())); - smwr->add(literal, id); /* inform small write handler about this literal */ + /* inform small write handler about this literal */ + smwr->add(literal, id); return true; } diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index 52353da9..4aa6a7dc 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,7 +87,8 @@ class SmallWriteBuild; class NG : boost::noncopyable { public: - NG(const CompileContext &in_cc, unsigned in_somPrecision); + NG(const CompileContext &in_cc, size_t num_patterns, + unsigned in_somPrecision); ~NG(); /** \brief Consumes a pattern, returns false or throws a CompileError @@ -118,8 +119,8 @@ public: BoundaryReports boundary; const CompileContext cc; - const std::unique_ptr rose; //!< Rose builder. const std::unique_ptr smwr; //!< SmallWrite builder. + const std::unique_ptr rose; //!< Rose builder. 
}; /** \brief Run graph reduction passes. diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp index 1b6d8826..ba352e60 100644 --- a/src/nfagraph/ng_anchored_dots.cpp +++ b/src/nfagraph/ng_anchored_dots.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -119,7 +119,7 @@ NFAVertex findReformable(const NGHolder &g, const set &starts, } if (dotq.empty()) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } const DotInfo &dot = dotq.top(); @@ -165,10 +165,10 @@ void reformAnchoredRepeatsComponent(NGHolder &g, return; } - NFAVertex dotV = NFAGraph::null_vertex(); + NFAVertex dotV = NGHolder::null_vertex(); set otherV; dotV = findReformable(g, compAnchoredStarts, otherV); - if (dotV == NFAGraph::null_vertex()) { + if (dotV == NGHolder::null_vertex()) { DEBUG_PRINTF("no candidate reformable dot found.\n"); return; } @@ -268,10 +268,10 @@ void reformUnanchoredRepeatsComponent(NGHolder &g, } while (true) { - NFAVertex dotV = NFAGraph::null_vertex(); + NFAVertex dotV = NGHolder::null_vertex(); set otherV; dotV = findReformable(g, compUnanchoredStarts, otherV); - if (dotV == NFAGraph::null_vertex()) { + if (dotV == NGHolder::null_vertex()) { DEBUG_PRINTF("no candidate reformable dot found.\n"); return; } @@ -464,7 +464,7 @@ void collapseVariableDotRepeat(NGHolder &g, NFAVertex start, // The first of our optional dots must be connected to start. The jump edge // past it will be verified in gatherParticipants(). If start is // graph.start, it should not be connected to startDs. - NFAVertex initialDot = NFAGraph::null_vertex(); + NFAVertex initialDot = NGHolder::null_vertex(); for (auto v : adjacent_vertices_range(start, g)) { if (is_special(v, g)) { continue; diff --git a/src/nfagraph/ng_asserts.cpp b/src/nfagraph/ng_asserts.cpp index 2d02751f..e9e39345 100644 --- a/src/nfagraph/ng_asserts.cpp +++ b/src/nfagraph/ng_asserts.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -553,6 +553,7 @@ void ensureCodePointStart(ReportManager &rm, NGWrapper &g) { add_edge(g.startDs, v_4, g); remove_edge(orig, g); g.renumberEdges(); + clearReports(g); } } diff --git a/src/nfagraph/ng_builder.cpp b/src/nfagraph/ng_builder.cpp index 36ce80b0..8a92b7ee 100644 --- a/src/nfagraph/ng_builder.cpp +++ b/src/nfagraph/ng_builder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -131,7 +131,7 @@ NFABuilderImpl::~NFABuilderImpl() { NFAVertex NFABuilderImpl::getVertex(Position pos) const { assert(id2vertex.size() >= pos); const NFAVertex v = id2vertex[pos]; - assert(v != NFAGraph::null_vertex()); + assert(v != NGHolder::null_vertex()); assert(graph->g[v].index == pos); return v; } diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 5ca5ce3a..658e7001 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright 
(c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -219,8 +219,8 @@ vector findShellEdges(const NGHolder &g, static void removeVertices(const flat_set &verts, NFAUndirectedGraph &ug, - ue2::unordered_map &old2new, - ue2::unordered_map &new2old) { + ue2::unordered_map &old2new, + ue2::unordered_map &new2old) { for (auto v : verts) { assert(contains(old2new, v)); auto uv = old2new.at(v); @@ -280,7 +280,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, createUnGraph(g.g, true, true, ug, old2new, newIdx2old); // Construct reverse mapping. - ue2::unordered_map new2old; + ue2::unordered_map new2old; for (const auto &m : old2new) { new2old.emplace(m.second, m.first); } @@ -308,7 +308,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, // Collect vertex lists per component. for (const auto &m : split_components) { - NFAVertex uv = m.first; + NFAUndirectedVertex uv = m.first; u32 c = m.second; assert(contains(new2old, uv)); NFAVertex v = new2old.at(uv); @@ -363,6 +363,12 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, *shell_comp = true; } + // Ensure that only vertices with accept edges have reports. + for (auto &gc : comps) { + assert(gc); + clearReports(*gc); + } + // We should never produce empty component graphs. assert(all_of(begin(comps), end(comps), [](const unique_ptr &g_comp) { diff --git a/src/nfagraph/ng_dump.cpp b/src/nfagraph/ng_dump.cpp index 60122cf3..57668caf 100644 --- a/src/nfagraph/ng_dump.cpp +++ b/src/nfagraph/ng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -450,7 +450,13 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) { fprintf(f, " reverse nfa: %u", report.revNfaIndex); } if (isSomRelSetReport(report)) { - fprintf(f, " set, adjust: %lld", report.somDistance); + fprintf(f, " set, adjust: %llu", report.somDistance); + } + if (report.type == EXTERNAL_CALLBACK_SOM_REL) { + fprintf(f, " relative: %llu", report.somDistance); + } + if (report.type == EXTERNAL_CALLBACK_SOM_ABS) { + fprintf(f, " absolute: %llu", report.somDistance); } fprintf(f, "\n"); } diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index b8e5a8d6..d0ab7c4a 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +38,7 @@ #include "ng_util.h" #include "util/compile_context.h" #include "util/graph_range.h" -#include "util/order_check.h" +#include "util/ue2_containers.h" #include #include @@ -53,9 +53,8 @@ using boost::ptr_vector; namespace ue2 { enum EquivalenceType { - LEFT_EQUIVALENCE = 0, + LEFT_EQUIVALENCE, RIGHT_EQUIVALENCE, - MAX_EQUIVALENCE }; namespace { @@ -91,7 +90,6 @@ public: } typedef ue2::unordered_set VertexInfoSet; -typedef ue2::unordered_map ClassMap; // compare two vertex info pointers on their vertex index bool VertexInfoPtrCmp::operator()(const VertexInfo *a, @@ -118,27 +116,34 @@ public: DepthMinMax d1; DepthMinMax d2; }; - ClassInfo(const NGHolder &g, VertexInfo &vi, 
ClassDepth &d_in, + ClassInfo(const NGHolder &g, const VertexInfo &vi, const ClassDepth &d_in, EquivalenceType eq) - : vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr), - depth(d_in) { + : /* reports only matter for right-equiv */ + rs(eq == RIGHT_EQUIVALENCE ? g[vi.v].reports : flat_set()), + vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr), + adjacent_cr(eq == LEFT_EQUIVALENCE ? vi.pred_cr : vi.succ_cr), + /* treat non-special vertices the same */ + node_type(min(g[vi.v].index, u32{N_SPECIALS})), depth(d_in) {} - // hackety-hack! - node_type = g[vi.v].index; - if (node_type > N_SPECIALS) { - // we treat all regular vertices the same - node_type = N_SPECIALS; - } - - // get all the adjacent vertices' CharReach - adjacent_cr = eq == LEFT_EQUIVALENCE ? vi.pred_cr : vi.succ_cr; - - if (eq == RIGHT_EQUIVALENCE) { - rs = g[vi.v].reports; - } + bool operator==(const ClassInfo &b) const { + return node_type == b.node_type && depth.d1 == b.depth.d1 && + depth.d2 == b.depth.d2 && cr == b.cr && + adjacent_cr == b.adjacent_cr && edge_top == b.edge_top && + vertex_flags == b.vertex_flags && rs == b.rs; } - bool operator<(const ClassInfo &b) const; + friend size_t hash_value(const ClassInfo &c) { + size_t val = 0; + boost::hash_combine(val, boost::hash_range(begin(c.rs), end(c.rs))); + boost::hash_combine(val, c.vertex_flags); + boost::hash_combine(val, c.edge_top); + boost::hash_combine(val, c.cr); + boost::hash_combine(val, c.adjacent_cr); + boost::hash_combine(val, c.node_type); + boost::hash_combine(val, c.depth.d1); + boost::hash_combine(val, c.depth.d2); + return val; + } private: flat_set rs; /* for right equiv only */ @@ -200,26 +205,12 @@ public: return q.capacity(); } private: - set ids; //!< stores id's, for uniqueness + unordered_set ids; //!< stores id's, for uniqueness vector q; //!< vector of id's that we use as FILO. }; } -bool ClassInfo::operator<(const ClassInfo &b) const { - const ClassInfo &a = *this; - - ORDER_CHECK(node_type); - ORDER_CHECK(depth.d1); - ORDER_CHECK(depth.d2); - ORDER_CHECK(cr); - ORDER_CHECK(adjacent_cr); - ORDER_CHECK(edge_top); - ORDER_CHECK(vertex_flags); - ORDER_CHECK(rs); - return false; -} - static bool outIsIrreducible(NFAVertex &v, const NGHolder &g) { unsigned nonSpecialVertices = 0; @@ -286,9 +277,14 @@ bool hasEdgeAsserts(NFAVertex v, const NGHolder &g) { // populate VertexInfo table static -void getVertexInfos(const NGHolder &g, ptr_vector &infos) { +ptr_vector getVertexInfos(const NGHolder &g) { + const size_t num_verts = num_vertices(g); + + ptr_vector infos; + infos.reserve(num_verts * 2); + vector vertex_map; // indexed by vertex_index property - vertex_map.resize(num_vertices(g)); + vertex_map.resize(num_verts); for (auto v : vertices_range(g)) { VertexInfo *vi = new VertexInfo(v, g); @@ -323,14 +319,24 @@ void getVertexInfos(const NGHolder &g, ptr_vector &infos) { } assert(!hasEdgeAsserts(cur_vi.v, g)); } + + return infos; } // store equivalence class in VertexInfo for each vertex static -void partitionGraph(ptr_vector &infos, ClassMap &classes, - WorkQueue &work_queue, const NGHolder &g, - EquivalenceType eq) { - map classinfomap; +vector partitionGraph(ptr_vector &infos, + WorkQueue &work_queue, const NGHolder &g, + EquivalenceType eq) { + const size_t num_verts = infos.size(); + + vector classes; + unordered_map classinfomap; + + // assume we will have lots of classes, so we don't waste time resizing + // these structures. 
+ classes.reserve(num_verts); + classinfomap.reserve(num_verts); // get distances from start (or accept) for all vertices // only one of them is used at a time, never both @@ -356,28 +362,25 @@ void partitionGraph(ptr_vector &infos, ClassMap &classes, auto ii = classinfomap.find(ci); if (ii == classinfomap.end()) { - unsigned new_class = classinfomap.size(); - vi.equivalence_class = new_class; - - classinfomap[ci] = new_class; - - // insert this vertex into the class map - VertexInfoSet &vertices = classes[new_class]; - vertices.insert(&vi); + // vertex is in a new equivalence class by itself. + unsigned eq_class = classes.size(); + vi.equivalence_class = eq_class; + classes.push_back({&vi}); + classinfomap.emplace(move(ci), eq_class); } else { + // vertex is added to an existing class. unsigned eq_class = ii->second; vi.equivalence_class = eq_class; - - // insert this vertex into the class map - VertexInfoSet &vertices = classes[eq_class]; - vertices.insert(&vi); + classes.at(eq_class).insert(&vi); // we now know that this particular class has more than one // vertex, so we add it to the work queue work_queue.push(eq_class); } } - DEBUG_PRINTF("partitioned, %zu equivalence classes\n", classinfomap.size()); + + DEBUG_PRINTF("partitioned, %zu equivalence classes\n", classes.size()); + return classes; } // generalized equivalence processing (left and right) @@ -388,7 +391,7 @@ void partitionGraph(ptr_vector &infos, ClassMap &classes, // equivalence, predecessors for right equivalence) classes get revalidated in // case of a split. static -void equivalence(ClassMap &classmap, WorkQueue &work_queue, +void equivalence(vector &classes, WorkQueue &work_queue, EquivalenceType eq_type) { // now, go through the work queue until it's empty map, VertexInfoSet> tentative_classmap; @@ -397,12 +400,11 @@ void equivalence(ClassMap &classmap, WorkQueue &work_queue, WorkQueue reval_queue(work_queue.capacity()); while (!work_queue.empty()) { - // dequeue our class from the work queue unsigned cur_class = work_queue.pop(); // get all vertices in current equivalence class - VertexInfoSet &cur_class_vertices = classmap[cur_class]; + VertexInfoSet &cur_class_vertices = classes.at(cur_class); if (cur_class_vertices.size() < 2) { continue; @@ -445,16 +447,20 @@ void equivalence(ClassMap &classmap, WorkQueue &work_queue, // start from the second class for (++tmi; tmi != tentative_classmap.end(); ++tmi) { - unsigned new_class = classmap.size(); const VertexInfoSet &vertices_to_split = tmi->second; - VertexInfoSet &new_class_vertices = classmap[new_class]; + unsigned new_class = classes.size(); + VertexInfoSet new_class_vertices; for (VertexInfo *vi : vertices_to_split) { vi->equivalence_class = new_class; - cur_class_vertices.erase(vi); + // note: we cannot use the cur_class_vertices ref, as it is + // invalidated by modifications to the classes vector. + classes[cur_class].erase(vi); new_class_vertices.insert(vi); } - if (tmi->first.find(cur_class) != tmi->first.end()) { + classes.push_back(move(new_class_vertices)); + + if (contains(tmi->first, cur_class)) { reval_queue.push(new_class); } } @@ -619,16 +625,15 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, // vertex (or, in rare cases for left equiv, a pair if we cannot satisfy the // report behaviour with a single vertex). 
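The reworked partitioning above keys its map directly on ClassInfo, which is why ClassInfo earlier in this file gained operator== and a friend hash_value() in place of the old operator< and ORDER_CHECK chain: an unordered map needs equality plus a hash rather than an ordering. A standalone sketch of that hashing protocol (generic field names, using the same boost hash utilities the diff uses):

    #include <boost/functional/hash.hpp>
    #include <unordered_map>
    #include <vector>

    struct Key {
        unsigned node_type;
        std::vector<unsigned> reports;

        bool operator==(const Key &b) const {
            return node_type == b.node_type && reports == b.reports;
        }

        // found via ADL by boost::hash<Key>, mirroring ClassInfo::hash_value
        friend std::size_t hash_value(const Key &k) {
            std::size_t val = 0;
            boost::hash_combine(val, k.node_type);
            boost::hash_combine(val, boost::hash_range(k.reports.begin(),
                                                       k.reports.end()));
            return val;
        }
    };

    int main() {
        std::unordered_map<Key, unsigned, boost::hash<Key>> classes;
        classes[Key{0, {1, 2}}] = 7;
        return classes.count(Key{0, {1, 2}}) == 1 ? 0 : 1; // found via hash + equality
    }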
static -bool mergeEquivalentClasses(ClassMap &classmap, ptr_vector &infos, - NGHolder &g) { +bool mergeEquivalentClasses(vector &classes, + ptr_vector &infos, NGHolder &g) { bool merged = false; set toRemove; // go through all classes and merge classes with more than one vertex - for (auto &cm : classmap) { + for (unsigned eq_class = 0; eq_class < classes.size(); eq_class++) { // get all vertices in current equivalence class - unsigned eq_class = cm.first; - VertexInfoSet &cur_class_vertices = cm.second; + VertexInfoSet &cur_class_vertices = classes[eq_class]; // we don't care for single-vertex classes if (cur_class_vertices.size() > 1) { @@ -644,6 +649,26 @@ bool mergeEquivalentClasses(ClassMap &classmap, ptr_vector &infos, return merged; } +static +bool reduceGraphEquivalences(NGHolder &g, EquivalenceType eq_type) { + // create a list of equivalence classes to check + WorkQueue work_queue(num_vertices(g)); + + // get information on every vertex in the graph + // new vertices are allocated here, and stored in infos + ptr_vector infos = getVertexInfos(g); + + // partition the graph + auto classes = partitionGraph(infos, work_queue, g, eq_type); + + // do equivalence processing + equivalence(classes, work_queue, eq_type); + + // replace equivalent classes with single vertices + // new vertices are (possibly) allocated here, and stored in infos + return mergeEquivalentClasses(classes, infos, g); +} + bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) { if (!cc.grey.equivalenceEnable) { DEBUG_PRINTF("equivalence processing disabled in grey box\n"); @@ -661,34 +686,8 @@ bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) { // take note if we have merged any vertices bool merge = false; - - for (int eqi = 0; eqi < MAX_EQUIVALENCE; ++eqi) { - // map of all information pertaining a vertex - ptr_vector infos; - ClassMap classes; - - // create a list of equivalence classes to check - WorkQueue work_queue(num_vertices(g)); - EquivalenceType eq_type = (EquivalenceType) eqi; - - // resize the vector, make room for twice the vertices we have - infos.reserve(num_vertices(g) * 2); - - // get information on every vertex in the graph - // new vertices are allocated here, and stored in infos - getVertexInfos(g, infos); - - // partition the graph - partitionGraph(infos, classes, work_queue, g, eq_type); - - // do equivalence processing - equivalence(classes, work_queue, eq_type); - - // replace equivalent classes with single vertices - // new vertices are (possibly) allocated here, and stored in infos - merge |= mergeEquivalentClasses(classes, infos, g); - } - + merge |= reduceGraphEquivalences(g, LEFT_EQUIVALENCE); + merge |= reduceGraphEquivalences(g, RIGHT_EQUIVALENCE); return merge; } diff --git a/src/nfagraph/ng_execute.cpp b/src/nfagraph/ng_execute.cpp index 92bef737..4ffd89c0 100644 --- a/src/nfagraph/ng_execute.cpp +++ b/src/nfagraph/ng_execute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,7 +58,7 @@ namespace ue2 { struct StateInfo { StateInfo(NFAVertex v, const CharReach &cr) : vertex(v), reach(cr) {} - StateInfo() : vertex(NFAGraph::null_vertex()) {} + StateInfo() : vertex(NGHolder::null_vertex()) {} NFAVertex vertex; CharReach reach; }; @@ -324,4 +324,49 @@ flat_set execute_graph(const NGHolder &running_g, initial_states); } +static +bool 
can_die_early(const NGHolder &g, const vector &info, + const dynamic_bitset<> &s, + map, u32> &visited, u32 age_limit) { + if (contains(visited, s) && visited[s] >= age_limit) { + /* we have already (or are in the process) of visiting here with a + * looser limit. */ + return false; + } + visited[s] = age_limit; + + if (s.none()) { + DEBUG_PRINTF("dead\n"); + return true; + } + + if (age_limit == 0) { + return false; + } + + dynamic_bitset<> all_succ(s.size()); + step(g, info, s, &all_succ); + all_succ.reset(NODE_START_DOTSTAR); + + for (u32 i = 0; i < N_CHARS; i++) { + dynamic_bitset<> next = all_succ; + filter_by_reach(info, &next, CharReach(i)); + if (can_die_early(g, info, next, visited, age_limit - 1)) { + return true; + } + } + + return false; +} + +bool can_die_early(const NGHolder &g, u32 age_limit) { + if (proper_out_degree(g.startDs, g)) { + return false; + } + const vector &info = makeInfoTable(g); + map, u32> visited; + return can_die_early(g, info, makeStateBitset(g, {g.start}), visited, + age_limit); +} + } // namespace ue2 diff --git a/src/nfagraph/ng_execute.h b/src/nfagraph/ng_execute.h index e2c7c72d..bdcfecfd 100644 --- a/src/nfagraph/ng_execute.h +++ b/src/nfagraph/ng_execute.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,6 +64,9 @@ flat_set execute_graph(const NGHolder &g, const NGHolder &input_dag, const flat_set &input_start_states, const flat_set &initial); +/* returns true if it is possible for the nfa to die within age_limit bytes */ +bool can_die_early(const NGHolder &g, u32 age_limit); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index 17d2a513..bc101df2 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -294,21 +294,21 @@ bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, static NFAVertex findSingleCyclic(const NGHolder &g) { - NFAVertex v = NFAGraph::null_vertex(); + NFAVertex v = NGHolder::null_vertex(); for (const auto &e : edges_range(g)) { if (source(e, g) == target(e, g)) { if (source(e, g) == g.startDs) { continue; } - if (v != NFAGraph::null_vertex()) { + if (v != NGHolder::null_vertex()) { // More than one cyclic vertex. - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } v = source(e, g); } } - if (v != NFAGraph::null_vertex()) { + if (v != NGHolder::null_vertex()) { DEBUG_PRINTF("cyclic is %u\n", g[v].index); assert(!is_special(v, g)); } @@ -359,11 +359,11 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // The graph must contain a single cyclic vertex (other than startDs), and // that vertex can have one pred and one successor. 
NFAVertex cyclic = findSingleCyclic(g); - if (cyclic == NFAGraph::null_vertex()) { + if (cyclic == NGHolder::null_vertex()) { return false; } - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; tie(ai, ae) = adjacent_vertices(g.start, g); if (*ai == g.startDs) { ++ai; @@ -411,7 +411,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // Check the cyclic state is A-OK. v = getSoleDestVertex(g, cyclic); - if (v == NFAGraph::null_vertex()) { + if (v == NGHolder::null_vertex()) { DEBUG_PRINTF("cyclic has more than one successor\n"); return false; } diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index 8fe4889d..e70b7708 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -89,11 +89,11 @@ void populateInit(const NGHolder &g, const flat_set &unused, } v_by_index->clear(); - v_by_index->resize(num_vertices(g), NFAGraph::null_vertex()); + v_by_index->resize(num_vertices(g), NGHolder::null_vertex()); for (auto v : vertices_range(g)) { u32 v_index = g[v].index; - assert((*v_by_index)[v_index] == NFAGraph::null_vertex()); + assert((*v_by_index)[v_index] == NGHolder::null_vertex()); (*v_by_index)[v_index] = v; } } diff --git a/src/nfagraph/ng_holder.cpp b/src/nfagraph/ng_holder.cpp index fd403378..53566891 100644 --- a/src/nfagraph/ng_holder.cpp +++ b/src/nfagraph/ng_holder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -154,7 +154,7 @@ void clear_out_edges(NFAVertex v, NGHolder &h) { } void clear_graph(NGHolder &h) { - NFAGraph::vertex_iterator vi, ve; + NGHolder::vertex_iterator vi, ve; for (tie(vi, ve) = vertices(h); vi != ve;) { NFAVertex v = *vi; ++vi; diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h index 3243f665..f0a387d0 100644 --- a/src/nfagraph/ng_holder.h +++ b/src/nfagraph/ng_holder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -171,7 +171,7 @@ bool is_special(const NFAVertex v, const GraphT &g) { } static really_inline -std::pair +std::pair adjacent_vertices(NFAVertex v, const NGHolder &h) { return adjacent_vertices(v, h.g); } @@ -182,7 +182,7 @@ std::pair edge(NFAVertex u, NFAVertex v, const NGHolder &h) { } static really_inline -std::pair +std::pair edges(const NGHolder &h) { return edges(h.g); } @@ -193,13 +193,13 @@ size_t in_degree(NFAVertex v, const NGHolder &h) { } static really_inline -std::pair +std::pair in_edges(NFAVertex v, const NGHolder &h) { return in_edges(v, h.g); } static really_inline -std::pair +std::pair inv_adjacent_vertices(NFAVertex v, const NGHolder &h) { return inv_adjacent_vertices(v, h.g); } @@ -210,7 +210,7 @@ size_t out_degree(NFAVertex v, const NGHolder &h) { } static really_inline -std::pair +std::pair out_edges(NFAVertex v, const NGHolder &h) { return out_edges(v, h.g); } @@ -226,7 +226,7 @@ NFAVertex target(const NFAEdge &e, const NGHolder &h) { } static really_inline 
-std::pair +std::pair vertices(const NGHolder &h) { return vertices(h.g); } @@ -239,6 +239,16 @@ vertices(const NGHolder &h) { */ void clear_graph(NGHolder &h); +inline +void renumber_edges(NGHolder &h) { + h.renumberEdges(); +} + +inline +void renumber_vertices(NGHolder &h) { + h.renumberVertices(); +} + /* * \brief Clear and remove all of the vertices pointed to by the given iterator * range. @@ -315,15 +325,26 @@ void remove_edges(const Container &c, NGHolder &h, bool renumber = true) { remove_edges(c.begin(), c.end(), h, renumber); } -static UNUSED +inline bool is_triggered(const NGHolder &g) { return is_triggered(g.kind); } -static UNUSED +inline bool generates_callbacks(const NGHolder &g) { return generates_callbacks(g.kind); } + +inline +bool has_managed_reports(const NGHolder &g) { + return has_managed_reports(g.kind); +} + +inline +bool inspects_states_for_accepts(const NGHolder &g) { + return inspects_states_for_accepts(g.kind); +} + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 713fe370..72efa43a 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -79,13 +79,17 @@ bool sanityCheckGraph(const NGHolder &g, } } - // Vertices with edges to accept or acceptEod must have reports. + // Vertices with edges to accept or acceptEod must have reports and + // other vertices must not have them. if (is_match_vertex(v, g) && v != g.accept) { if (g[v].reports.empty()) { - DEBUG_PRINTF("vertex %u has no reports\n", - g[v].index); + DEBUG_PRINTF("vertex %u has no reports\n", g[v].index); return false; } + } else if (!g[v].reports.empty()) { + DEBUG_PRINTF("vertex %u has reports but no accept edge\n", + g[v].index); + return false; } // Participant vertices should have distinct state indices. @@ -164,7 +168,7 @@ void makeTopStates(NGHolder &g, map &tops, assert(!contains(tops, t)); - NFAVertex s = NFAGraph::null_vertex(); + NFAVertex s = NGHolder::null_vertex(); flat_set succs; insert(&succs, top.second); @@ -373,7 +377,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, const map>> &triggers, bool compress_state, bool do_accel, bool impl_test_only, u32 hint, const CompileContext &cc) { - if (!generates_callbacks(h_in)) { + if (!has_managed_reports(h_in)) { rm = nullptr; } else { assert(rm); @@ -413,7 +417,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, set zombies = findZombies(*h, br_cyclic, state_ids, cc); - if (generates_callbacks(*h)) { + if (has_managed_reports(*h)) { assert(rm); remapReportsToPrograms(*h, *rm); } @@ -501,6 +505,9 @@ aligned_unique_ptr constructReversedNFA(const NGHolder &h_in, u32 hint, u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, const CompileContext &cc) { + if (!cc.grey.allowLimExNFA) { + return false; + } // Quick check: we can always implement an NFA with less than NFA_MAX_STATES // states. Note that top masks can generate extra states, so we account for // those here too. 
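The tightened check above is the invariant behind the clearReports() calls added in ng_asserts.cpp and ng_calc_components.cpp earlier in this diff: once a graph has been cut up, reports may survive only on vertices that still feed accept or acceptEod. Stated as a standalone assertion over an NGHolder-style graph, using helpers visible elsewhere in this diff (illustrative only; the shipped check also covers the special vertices and state indices):

    // Illustrative invariant check, not the shipped sanityCheckGraph().
    for (auto v : vertices_range(g)) {
        if (is_special(v, g)) {
            continue;
        }
        bool feeds_accept = edge(v, g.accept, g).second ||
                            edge(v, g.acceptEod, g).second;
        assert(feeds_accept == !g[v].reports.empty());
    }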
@@ -508,7 +515,7 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, return true; } - if (!generates_callbacks(g)) { + if (!has_managed_reports(g)) { rm = nullptr; } else { assert(rm); @@ -547,7 +554,7 @@ void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm removeRedundancy(g, som); - if (rm && generates_callbacks(g)) { + if (rm && has_managed_reports(g)) { pruneHighlanderDominated(g, *rm); } @@ -560,7 +567,7 @@ void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm u32 countAccelStates(const NGHolder &g, const ReportManager *rm, const CompileContext &cc) { - if (!generates_callbacks(g)) { + if (!has_managed_reports(g)) { rm = nullptr; } else { assert(rm); diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index 1f991f19..deaf2ffd 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -658,7 +658,7 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) { return g.startDs; } - NFAVertex v = NFAGraph::null_vertex(); + NFAVertex v = NGHolder::null_vertex(); for (auto w : adjacent_vertices_range(g.start, g)) { if (w != g.startDs) { if (!v) { @@ -693,8 +693,8 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) { static NFAVertex find_next(const NFAVertex v, const NGHolder &g) { - NFAVertex res = NFAGraph::null_vertex(); - for (NFAVertex u : adjacent_vertices_range(v, g)) { + NFAVertex res = NGHolder::null_vertex(); + for (NFAVertex u : adjacent_vertices_range(v, g)) { if (u != v) { res = u; break; @@ -736,7 +736,7 @@ MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g, // find our start vertex NFAVertex cur = find_next(v, g); - if (cur == NFAGraph::null_vertex()) { + if (cur == NGHolder::null_vertex()) { DEBUG_PRINTF("invalid start vertex\n"); return MultibyteAccelInfo(); } diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index f9ef6061..9229457c 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,10 +64,6 @@ namespace { /* Small literal graph type used for the suffix tree used in * compressAndScore. */ -typedef boost::adjacency_list_traits LitGraphTraits; -typedef LitGraphTraits::vertex_descriptor LitVertex; -typedef LitGraphTraits::edge_descriptor LitEdge; struct LitGraphVertexProps { LitGraphVertexProps() {} @@ -79,11 +75,15 @@ struct LitGraphEdgeProps { LitGraphEdgeProps() {} explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} u64a score = NO_LITERAL_AT_EDGE_SCORE; + size_t index; /* only initialised when the reverse edges are added. 
*/ }; +/* keep edgeList = listS as you cannot remove edges if edgeList = vecS */ typedef boost::adjacency_list LitGraph; +typedef LitGraph::vertex_descriptor LitVertex; +typedef LitGraph::edge_descriptor LitEdge; typedef pair VertexPair; typedef std::queue LitVertexQ; @@ -339,6 +339,12 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e, g[source(e, g)].index, g[target(e, g)].index, s.size()); } +bool bad_mixed_sensitivity(const ue2_literal &s) { + /* TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end, + * we should be able to handle it */ + return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH; +} + static u64a litUniqueness(const string &s) { CharReach seen(s); @@ -474,43 +480,36 @@ const char *describeColor(boost::default_color_type c) { /** * The BGL's boykov_kolmogorov_max_flow requires that all edges have their - * reverse edge in the graph. This function adds them, returning the new edges - * and constructing a map of (edge, rev edge). + * reverse edge in the graph. This function adds them, returning a vector + * mapping edge index to reverse edge. Note: LitGraph should be a DAG so there + * should be no existing reverse_edges. */ static -vector addReverseEdges(LitGraph &lg, - ue2::unordered_map &reverse_edge_map) { - vector reverseMe; - - reverse_edge_map.clear(); - reverse_edge_map.reserve(num_edges(lg) * 2); +vector add_reverse_edges_and_index(LitGraph &lg) { + vector fwd_edges; + size_t next_index = 0; for (const auto &e : edges_range(lg)) { - LitVertex u = source(e, lg), v = target(e, lg); - assert(u != v); - - bool exists; - LitEdge rev; - tie(rev, exists) = edge(v, u, lg); - if (exists) { - reverse_edge_map[e] = rev; - } else { - reverseMe.push_back(e); - } + lg[e].index = next_index++; + fwd_edges.push_back(e); } - vector reverseEdges; - reverseEdges.reserve(reverseMe.size()); + vector rev_map(2 * num_edges(lg)); - for (const auto &e : reverseMe) { - LitVertex u = source(e, lg), v = target(e, lg); - LitEdge rev = add_edge(v, u, lg[e], lg).first; - reverseEdges.push_back(rev); - reverse_edge_map[e] = rev; - reverse_edge_map[rev] = e; + for (const auto &e : fwd_edges) { + LitVertex u = source(e, lg); + LitVertex v = target(e, lg); + + assert(!edge(v, u, lg).second); + + LitEdge rev = add_edge(v, u, lg).first; + lg[rev].score = 0; + lg[rev].index = next_index++; + rev_map[lg[e].index] = rev; + rev_map[lg[rev].index] = e; } - return reverseEdges; + return rev_map; } static @@ -522,33 +521,33 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink, assert(!in_degree(root, lg)); assert(!out_degree(sink, lg)); + size_t num_real_edges = num_edges(lg); // Add reverse edges for the convenience of the BGL's max flow algorithm. 
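As an aside, the property-map pattern that the findMinCut() hunk below switches to can be shown in a small, self-contained form. The sketch that follows is not taken from the Hyperscan sources: the toy graph, EdgeProps and every other name are invented for illustration. It runs boost::boykov_kolmogorov_max_flow over an adjacency_list whose edges carry a hand-assigned index in a bundled property, and supplies residuals, reverse edges, predecessors, colours and distances through make_iterator_property_map keyed on that index.

#include <cstddef>
#include <cstdio>
#include <vector>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
#include <boost/property_map/property_map.hpp>

struct EdgeProps {
    long capacity = 0;
    std::size_t index = 0; /* assigned by hand; adjacency_list provides no
                            * automatic edge_index property */
};

typedef boost::adjacency_list<boost::listS, boost::vecS, boost::bidirectionalS,
                              boost::no_property, EdgeProps> Graph;
typedef Graph::vertex_descriptor Vertex;
typedef Graph::edge_descriptor Edge;

int main() {
    Graph g;
    Vertex s = add_vertex(g), a = add_vertex(g), t = add_vertex(g);

    std::vector<Edge> rev_edges;
    std::size_t next_index = 0;

    /* add a forward edge and its zero-capacity reverse edge, index both, and
     * record the pairing in rev_edges */
    auto add_pair = [&](Vertex u, Vertex v, long cap) {
        Edge fwd = add_edge(u, v, g).first;
        Edge bwd = add_edge(v, u, g).first;
        g[fwd].capacity = cap;
        g[bwd].capacity = 0;
        g[fwd].index = next_index++;
        g[bwd].index = next_index++;
        rev_edges.resize(next_index);
        rev_edges[g[fwd].index] = bwd;
        rev_edges[g[bwd].index] = fwd;
    };

    add_pair(s, a, 3);
    add_pair(a, t, 2);

    const auto v_index_map = get(boost::vertex_index, g);
    const auto e_index_map = get(&EdgeProps::index, g);

    const std::size_t num_verts = num_vertices(g);
    std::vector<boost::default_color_type> colors(num_verts);
    std::vector<long> distances(num_verts);
    std::vector<Edge> predecessors(num_verts);
    std::vector<long> residuals(num_edges(g));

    long flow = boost::boykov_kolmogorov_max_flow(
        g, get(&EdgeProps::capacity, g),
        boost::make_iterator_property_map(residuals.begin(), e_index_map),
        boost::make_iterator_property_map(rev_edges.begin(), e_index_map),
        boost::make_iterator_property_map(predecessors.begin(), v_index_map),
        boost::make_iterator_property_map(colors.begin(), v_index_map),
        boost::make_iterator_property_map(distances.begin(), v_index_map),
        v_index_map, s, t);

    std::printf("max flow: %ld\n", flow); /* expect 2 for this toy graph */
    return 0;
}

The hunk below takes the same shape: LitGraph keeps listS edge storage so the temporary reverse edges can be removed again afterwards, and since adjacency_list supplies no edge_index of its own, add_reverse_edges_and_index() numbers the edges explicitly and the iterator property maps are keyed on &LitGraphEdgeProps::index.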
- ue2::unordered_map reverse_edge_map; - vector tempEdges = addReverseEdges(lg, reverse_edge_map); + vector rev_edges = add_reverse_edges_and_index(lg); const auto v_index_map = get(vertex_index, lg); + const auto e_index_map = get(&LitGraphEdgeProps::index, lg); const size_t num_verts = num_vertices(lg); vector colors(num_verts); vector distances(num_verts); vector predecessors(num_verts); - ue2::unordered_map residuals; - residuals.reserve(num_edges(lg)); + vector residuals(num_edges(lg)); UNUSED u64a flow = boykov_kolmogorov_max_flow(lg, get(&LitGraphEdgeProps::score, lg), - make_assoc_property_map(residuals), - make_assoc_property_map(reverse_edge_map), + make_iterator_property_map(residuals.begin(), e_index_map), + make_iterator_property_map(rev_edges.begin(), e_index_map), make_iterator_property_map(predecessors.begin(), v_index_map), make_iterator_property_map(colors.begin(), v_index_map), make_iterator_property_map(distances.begin(), v_index_map), - get(vertex_index, lg), root, sink); + v_index_map, root, sink); DEBUG_PRINTF("done, flow = %llu\n", flow); - // Remove temporary reverse edges. - for (const auto &e : tempEdges) { - remove_edge(e, lg); - } + /* remove reverse edges */ + remove_edge_if([&](const LitEdge &e) { + return lg[e].index >= num_real_edges; + }, lg); vector white_cut, black_cut; u64a white_flow = 0, black_flow = 0; @@ -631,6 +630,48 @@ u64a compressAndScore(set &s) { return score; } +/* like compressAndScore, but replaces long mixed sensitivity literals with + * something weaker. */ +u64a sanitizeAndCompressAndScore(set &lits) { + const size_t maxExploded = 8; // only case-explode this far + + /* TODO: the whole compression thing could be made better by systematically + * considering replacing literal sets not just by common suffixes but also + * by nocase literals. */ + + vector replacements; + + for (auto it = lits.begin(); it != lits.end();) { + auto jt = it; + ++it; + + if (!bad_mixed_sensitivity(*jt)) { + continue; + } + + /* we have to replace *jt with something... 
*/ + ue2_literal s = *jt; + lits.erase(jt); + + vector exploded; + for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) { + exploded.emplace_back(*cit, false); + if (exploded.size() > maxExploded) { + goto dont_explode; + } + } + insert(&replacements, replacements.end(), exploded); + + continue; + dont_explode: + make_nocase(&s); + replacements.push_back(s); + } + + insert(&lits, replacements); + return compressAndScore(lits); +} + u64a scoreSet(const set &s) { if (s.empty()) { return NO_LITERAL_AT_EDGE_SCORE; @@ -681,7 +722,7 @@ set getLiteralSet(const NGHolder &g, const NFAVertex &v, return s; } -vector scoreEdges(const NGHolder &g) { +vector scoreEdges(const NGHolder &g, const flat_set &known_bad) { assert(hasCorrectlyNumberedEdges(g)); vector scores(num_edges(g)); @@ -689,8 +730,12 @@ vector scoreEdges(const NGHolder &g) { for (const auto &e : edges_range(g)) { u32 eidx = g[e].index; assert(eidx < scores.size()); - set ls = getLiteralSet(g, e); - scores[eidx] = compressAndScore(ls); + if (contains(known_bad, e)) { + scores[eidx] = NO_LITERAL_AT_EDGE_SCORE; + } else { + set ls = getLiteralSet(g, e); + scores[eidx] = compressAndScore(ls); + } } return scores; @@ -849,4 +894,49 @@ bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) { return true; } +bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) { + NFAVertex v = g.accept; + + for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { + NGHolder::inv_adjacency_iterator ai, ae; + tie(ai, ae) = inv_adjacent_vertices(v, g); + if (ai == ae) { + assert(0); // no predecessors? + return false; + } + v = *ai++; + if (ai != ae) { + DEBUG_PRINTF("branch, fail\n"); + return false; + } + + if (is_special(v, g)) { + DEBUG_PRINTF("special found, fail\n"); + return false; + } + + const CharReach &cr_g = g[v].char_reach; + const CharReach &cr_l = *it; + + if (!cr_l.isSubsetOf(cr_g)) { + /* running over the prefix is needed to prevent false postives */ + DEBUG_PRINTF("reach fail\n"); + return false; + } + } + + // Our last value for v should have only start states for predecessors. + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (!is_any_start(u, g)) { + DEBUG_PRINTF("pred is not start\n"); + return false; + } + } + + assert(num_vertices(g) == lit.length() + N_SPECIALS); + + DEBUG_PRINTF("ok\n"); + return true; +} + } // namespace ue2 diff --git a/src/nfagraph/ng_literal_analysis.h b/src/nfagraph/ng_literal_analysis.h index 4fa72b9f..6fd9c525 100644 --- a/src/nfagraph/ng_literal_analysis.h +++ b/src/nfagraph/ng_literal_analysis.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,9 +42,7 @@ namespace ue2 { #define NO_LITERAL_AT_EDGE_SCORE 10000000ULL - -/* Score for special-to-special edges */ -#define INVALID_EDGE_CAP 100000000ULL +#define INVALID_EDGE_CAP 100000000ULL /* special-to-special score */ class NGHolder; @@ -59,9 +57,20 @@ std::set getLiteralSet(const NGHolder &g, const NFAVertex &v, bool only_first_encounter = true); std::set getLiteralSet(const NGHolder &g, const NFAEdge &e); -/** Score all the edges in the given graph, returning them in \p scores indexed +/** + * Returns true if we are unable to use a mixed sensitivity literal in rose (as + * our literal matchers are generally either case sensitive or not). 
+ * + * Shortish mixed sensitivity literals can be handled by confirm checks in rose + * and are not flagged as bad. + */ +bool bad_mixed_sensitivity(const ue2_literal &s); + +/** + * Score all the edges in the given graph, returning them in \p scores indexed * by edge_index. */ -std::vector scoreEdges(const NGHolder &h); +std::vector scoreEdges(const NGHolder &h, + const flat_set &known_bad = {}); /** Returns a score for a literal set. Lower scores are better. */ u64a scoreSet(const std::set &s); @@ -69,6 +78,12 @@ u64a scoreSet(const std::set &s); /** Compress a literal set to fewer literals. */ u64a compressAndScore(std::set &s); +/** + * Compress a literal set to fewer literals and replace any long mixed + * sensitivity literals with supported literals. + */ +u64a sanitizeAndCompressAndScore(std::set &s); + bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, NGHolder *rhs); @@ -77,6 +92,10 @@ bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out); +/** \brief Returns true if the given literal is the only thing in the graph, + * from (start or startDs) to accept. */ +bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp index 9ee4f151..871c8ac7 100644 --- a/src/nfagraph/ng_literal_component.cpp +++ b/src/nfagraph/ng_literal_component.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -186,7 +186,7 @@ bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, /** \brief Split off literals. True if any changes were made to the graph. */ bool splitOffLiterals(NG &ng, NGWrapper &g) { - if (!ng.cc.grey.allowRose) { + if (!ng.cc.grey.allowLiteral) { return false; } diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index b1c6ff96..39788570 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -173,11 +173,11 @@ void populateInit(const NGHolder &g, const flat_set &unused, } v_by_index->clear(); - v_by_index->resize(num_vertices(g), NFAGraph::null_vertex()); + v_by_index->resize(num_vertices(g), NGHolder::null_vertex()); for (auto v : vertices_range(g)) { u32 vert_id = g[v].index; - assert((*v_by_index)[vert_id] == NFAGraph::null_vertex()); + assert((*v_by_index)[vert_id] == NGHolder::null_vertex()); (*v_by_index)[vert_id] = v; } @@ -531,9 +531,9 @@ unique_ptr buildMcClellan(const NGHolder &graph, DEBUG_PRINTF("attempting to build ?%d? 
mcclellan\n", (int)graph.kind); assert(allMatchStatesHaveReports(graph)); - bool prunable = grey.highlanderPruneDFA && generates_callbacks(graph); - assert(rm || !generates_callbacks(graph)); - if (!generates_callbacks(graph)) { + bool prunable = grey.highlanderPruneDFA && has_managed_reports(graph); + assert(rm || !has_managed_reports(graph)); + if (!has_managed_reports(graph)) { rm = nullptr; } diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index c2b9eea9..8abc45b3 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -135,7 +135,7 @@ void findWidths(const NGHolder &g, // Wire our entries to start and our exits to accept. for (auto v : ri.vertices) { NFAVertex v_new = mapping[v]; - assert(v_new != NFAGraph::null_vertex()); + assert(v_new != NGHolder::null_vertex()); if (isRegionEntry(g, v, region_map) && !edge(rg.start, v_new, rg).second) { diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp index 540f4859..00b2e8ac 100644 --- a/src/nfagraph/ng_puff.cpp +++ b/src/nfagraph/ng_puff.cpp @@ -472,7 +472,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a, } NFAVertex puffv = nodes.back(); - assert(puffv != NFAGraph::null_vertex()); + assert(puffv != NGHolder::null_vertex()); u32 width = countChain(g, nodes.back()); flat_set chain_reports; diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp index b9b80c5b..26599251 100644 --- a/src/nfagraph/ng_redundancy.cpp +++ b/src/nfagraph/ng_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -158,7 +158,7 @@ void populateContainers(const NGHolder &g, VertexInfoMap &infoMap) { static void inplaceIntersection(vector &vset1, const flat_set &vset2) { - const NFAVertex GONE = NFAGraph::null_vertex(); + const NFAVertex GONE = NGHolder::null_vertex(); vector::iterator it = vset1.begin(), ite = vset1.end(); flat_set::const_iterator jt = vset2.begin(), jte = vset2.end(); diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index 80434a0a..bc7e73d3 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1202,7 +1202,7 @@ CharReach predReach(const NGHolder &g, NFAVertex v) { static void filterMap(const NGHolder &subg, ue2::unordered_map &vmap) { - NFAGraph::vertex_iterator vi, ve; + NGHolder::vertex_iterator vi, ve; tie(vi, ve) = vertices(subg); const ue2::unordered_set remaining_verts(vi, ve); diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index 3015af4c..137ac5cc 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following 
conditions are met: @@ -750,7 +750,7 @@ unique_ptr LitCollection::pickNext() { for (auto v : lits.back()->vv) { if (contains(poisoned, v)) { DEBUG_PRINTF("skipping '%s' as overlapped\n", - ((const string &)*lits.back()->lit.begin()).c_str()); + dumpString(*(lits.back()->lit.begin())).c_str()); lits.pop_back(); goto next_lit; } @@ -760,7 +760,7 @@ unique_ptr LitCollection::pickNext() { lits.pop_back(); poisonCandidates(*rv); DEBUG_PRINTF("best is '%s' %u a%d t%d\n", - ((const string &)*rv->lit.begin()).c_str(), + dumpString(*(rv->lit.begin())).c_str(), g[rv->vv.front()].index, (int)createsAnchoredLHS(g, rv->vv, depths, grey), (int)createsTransientLHS(g, rv->vv, depths, grey)); @@ -773,51 +773,6 @@ unique_ptr LitCollection::pickNext() { } -/** \brief Returns true if the given literal is the only thing in the graph, - * from start to accept. */ -static -bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) { - NFAVertex v = g.accept; - - for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { - NFAGraph::inv_adjacency_iterator ai, ae; - tie(ai, ae) = inv_adjacent_vertices(v, g); - if (ai == ae) { - assert(0); // no predecessors? - return false; - } - v = *ai++; - if (ai != ae) { - DEBUG_PRINTF("branch, fail\n"); - return false; - } - - if (is_special(v, g)) { - DEBUG_PRINTF("special found, fail\n"); - return false; - } - - const CharReach &cr = g[v].char_reach; - if (cr != *it) { - DEBUG_PRINTF("reach fail\n"); - return false; - } - } - - // Our last value for v should have only start states for predecessors. - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (!is_any_start(u, g)) { - DEBUG_PRINTF("pred is not start\n"); - return false; - } - } - - assert(num_vertices(g) == lit.length() + N_SPECIALS); - - DEBUG_PRINTF("ok\n"); - return true; -} - static bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { set curr, next; @@ -860,7 +815,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, max_delay--; } - DEBUG_PRINTF("killing off '%s'\n", ((const string &)lit).c_str()); + DEBUG_PRINTF("killing off '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); @@ -917,6 +872,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, } clear_in_edges(g.accept, g); + clearReports(g); vector verts(pred.begin(), pred.end()); sort(verts.begin(), verts.end(), VertexIndexOrdering(g)); @@ -933,19 +889,10 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, return delay; } -static void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay) { + u32 delay, const vector &preds) { assert(delay <= lit.length()); - DEBUG_PRINTF("adding on '%s' %u\n", ((const string &)lit).c_str(), delay); - - vector preds; - insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); - clear_in_edges(g.accept, g); - - for (auto v : preds) { - g[v].reports.clear(); /* clear report from old accepts */ - } + DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); NFAVertex prev = g.accept; auto it = lit.rbegin(); @@ -972,6 +919,19 @@ void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, assert(allMatchStatesHaveReports(g)); } +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay) { + vector preds; + insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); + clear_in_edges(g.accept, g); + + for (auto v : preds) { + g[v].reports.clear(); /* clear report from old accepts */ + } + + restoreTrailingLiteralStates(g, 
lit, delay, preds); +} + /* return false if we should get rid of the edge altogether */ static bool removeLiteralFromLHS(RoseInGraph &ig, const RoseInEdge &lhs, @@ -1824,9 +1784,6 @@ bool doNetflowCut(RoseInGraph &ig, const vector &to_cut, set lits = getLiteralSet(h, e); compressAndScore(lits); cut_lits[e] = lits; - - DEBUG_PRINTF("cut lit '%s'\n", - ((const string &)*cut_lits[e].begin()).c_str()); } /* if literals are underlength bail or if it involves a forbidden edge*/ @@ -2245,7 +2202,7 @@ bool improveLHS(RoseInGraph &ig, const vector &edges, const vector &local = by_src[v]; vector graphs; - map > by_graph; + map > by_graph; for (const auto &e : local) { NGHolder *gp = ig[e].graph.get(); if (!contains(by_graph, gp)) { diff --git a/src/nfagraph/ng_rose.h b/src/nfagraph/ng_rose.h index 4e16a3c4..d180e8a5 100644 --- a/src/nfagraph/ng_rose.h +++ b/src/nfagraph/ng_rose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,8 +33,11 @@ #ifndef NG_ROSE_H #define NG_ROSE_H +#include "ng_holder.h" #include "ue2common.h" +#include + namespace ue2 { class NGHolder; @@ -65,6 +68,13 @@ bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter, u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 max_delay, bool overhang_ok = true); +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay); + +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay, + const std::vector &preds); + } // namespace ue2 #endif // NG_ROSE_H diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index 03a612a0..ed2942bb 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -384,7 +384,7 @@ makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, add_edge(prefix.accept, prefix.acceptEod, prefix); assert(!next_enters.empty()); - assert(next_enters.front() != NFAGraph::null_vertex()); + assert(next_enters.front() != NGHolder::null_vertex()); u32 dead_region = regions.at(next_enters.front()); DEBUG_PRINTF("curr_region %u, dead_region %u\n", regions.at(curr_exits.front()), dead_region); @@ -2064,8 +2064,7 @@ sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, ReportManager &rm = ng.rm; SomSlotManager &ssm = ng.ssm; - // This approach relies on Rose. 
- if (!cc.grey.allowRose) { + if (!cc.grey.allowHaigLit) { return SOMBE_FAIL; } @@ -2537,7 +2536,7 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); bool lhs_all_vac = true; - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; for (tie(ai, ae) = adjacent_vertices(lhs->startDs, *lhs); ai != ae && lhs_all_vac; ++ai) { if (!is_special(*ai, *lhs)) { diff --git a/src/nfagraph/ng_split.cpp b/src/nfagraph/ng_split.cpp index 42157e1e..bce638c0 100644 --- a/src/nfagraph/ng_split.cpp +++ b/src/nfagraph/ng_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,7 +100,12 @@ void splitLHS(const NGHolder &base, const vector &pivots, add_edge((*lhs_map)[pivot], lhs->accept, *lhs); } - pruneUseless(*lhs); + /* should do the renumbering unconditionally as we know edges are already + * misnumbered */ + pruneUseless(*lhs, false); + renumber_edges(*lhs); + renumber_vertices(*lhs); + filterSplitMap(*lhs, lhs_map); switch (base.kind) { @@ -112,6 +117,12 @@ void splitLHS(const NGHolder &base, const vector &pivots, case NFA_SUFFIX: lhs->kind = NFA_INFIX; break; + case NFA_EAGER_PREFIX: + /* Current code should not be assigning eager until well after all the + * splitting is done. */ + assert(0); + lhs->kind = NFA_EAGER_PREFIX; + break; case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: assert(0); @@ -142,7 +153,12 @@ void splitRHS(const NGHolder &base, const vector &pivots, assert(contains(*rhs_map, pivot)); add_edge(rhs->start, (*rhs_map)[pivot], *rhs); } - pruneUseless(*rhs); + + /* should do the renumbering unconditionally as we know edges are already + * misnumbered */ + pruneUseless(*rhs, false); + renumber_edges(*rhs); + renumber_vertices(*rhs); filterSplitMap(*rhs, rhs_map); switch (base.kind) { @@ -154,6 +170,12 @@ void splitRHS(const NGHolder &base, const vector &pivots, case NFA_OUTFIX: rhs->kind = NFA_SUFFIX; break; + case NFA_EAGER_PREFIX: + /* Current code should not be assigning eager until well after all the + * splitting is done. */ + assert(0); + rhs->kind = NFA_INFIX; + break; case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: assert(0); diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp index dd3693e5..6577673f 100644 --- a/src/nfagraph/ng_squash.cpp +++ b/src/nfagraph/ng_squash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -369,7 +369,7 @@ map findSquashers(const NGHolder &g, som_type som) { buildPDomTree(g, pdom_tree); // Build list of vertices by state ID and a set of init states. 
- vector vByIndex(numStates, NFAGraph::null_vertex()); + vector vByIndex(numStates, NGHolder::null_vertex()); NFAStateSet initStates(numStates); smgb_cache cache(g); @@ -394,7 +394,7 @@ map findSquashers(const NGHolder &g, som_type som) { for (u32 i = 0; i < numStates; i++) { NFAVertex v = vByIndex[i]; - assert(v != NFAGraph::null_vertex()); + assert(v != NGHolder::null_vertex()); const CharReach &cr = g[v].char_reach; /* only non-init cyclics can be squashers */ diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp index abba09f9..217183de 100644 --- a/src/nfagraph/ng_uncalc_components.cpp +++ b/src/nfagraph/ng_uncalc_components.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -184,7 +184,7 @@ u32 commonPrefixLength(const NGHolder &ga, size_t a_count = 0; size_t b_count = 0; - NFAGraph::out_edge_iterator ei, ee; + NGHolder::out_edge_iterator ei, ee; for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) { u32 sid = a_state_ids.at(target(*ei, ga)); if (sid == NO_STATE || sid >= max) { @@ -213,7 +213,7 @@ u32 commonPrefixLength(const NGHolder &ga, } } - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae; ++ai) { u32 sid = b_state_ids.at(*ai); diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index bcf0ce29..c629d553 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,26 +78,26 @@ depth maxDistFromStartOfData(const NFAVertexDepth &vd) { } NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex a) { - assert(a != NFAGraph::null_vertex()); + assert(a != NGHolder::null_vertex()); - NFAGraph::out_edge_iterator ii, iie; + NGHolder::out_edge_iterator ii, iie; tie(ii, iie) = out_edges(a, g); if (ii == iie) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } NFAVertex b = target(*ii, g); if (a == b) { ++ii; if (ii == iie) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } b = target(*ii, g); if (++ii != iie) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } } else if (++ii != iie && (target(*ii, g) != a || ++ii != iie)) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } assert(a != b); @@ -105,23 +105,23 @@ NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex a) { } NFAVertex getSoleSourceVertex(const NGHolder &g, NFAVertex a) { - assert(a != NFAGraph::null_vertex()); + assert(a != NGHolder::null_vertex()); u32 idegree = in_degree(a, g); if (idegree != 1 && !(idegree == 2 && hasSelfLoop(a, g))) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } - NFAGraph::in_edge_iterator ii, iie; + NGHolder::in_edge_iterator ii, iie; tie(ii, iie) = in_edges(a, g); if (ii == iie) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } NFAVertex b = source(*ii, g); if (a == b) { ++ii; if (ii == iie) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } b = source(*ii, g); @@ -209,6 +209,15 @@ bool isAnchored(const NGHolder &g) { return true; } 
+bool isFloating(const NGHolder &g) { + for (auto v : adjacent_vertices_range(g.start, g)) { + if (v != g.startDs && !edge(g.startDs, v, g).second) { + return false; + } + } + return true; +} + bool isAcyclic(const NGHolder &g) { try { depth_first_search( @@ -321,7 +330,7 @@ bool can_match_at_eod(const NGHolder &h) { } bool can_only_match_at_eod(const NGHolder &g) { - NFAGraph::in_edge_iterator ie, ee; + NGHolder::in_edge_iterator ie, ee; tie(ie, ee) = in_edges(g.accept, g); return ie == ee; @@ -622,16 +631,18 @@ unique_ptr cloneHolder(const NGHolder &in) { } #ifndef NDEBUG -/** \brief Used in sanity-checking assertions: returns true if all vertices - * leading to accept or acceptEod have at least one report ID. */ + bool allMatchStatesHaveReports(const NGHolder &g) { + unordered_set reporters; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (g[v].reports.empty()) { DEBUG_PRINTF("vertex %u has no reports!\n", g[v].index); return false; } + reporters.insert(v); } + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { if (v == g.accept) { continue; // stylised edge @@ -641,12 +652,20 @@ bool allMatchStatesHaveReports(const NGHolder &g) { g[v].index); return false; } + reporters.insert(v); } + + for (auto v : vertices_range(g)) { + if (!contains(reporters, v) && !g[v].reports.empty()) { + DEBUG_PRINTF("vertex %u is not a match state, but has reports!\n", + g[v].index); + return false; + } + } + return true; } -/** Assertion: returns true if the vertices in this graph are contiguously (and - * uniquely) numbered from zero. */ bool hasCorrectlyNumberedVertices(const NGHolder &g) { size_t count = num_vertices(g); vector ids(count, false); @@ -657,11 +676,10 @@ bool hasCorrectlyNumberedVertices(const NGHolder &g) { } ids[id] = true; } - return find(ids.begin(), ids.end(), false) == ids.end(); + return find(ids.begin(), ids.end(), false) == ids.end() + && num_vertices(g) == num_vertices(g.g); } -/** Assertion: returns true if the edges in this graph are contiguously (and - * uniquely) numbered from zero. */ bool hasCorrectlyNumberedEdges(const NGHolder &g) { size_t count = num_edges(g); vector ids(count, false); @@ -672,8 +690,10 @@ bool hasCorrectlyNumberedEdges(const NGHolder &g) { } ids[id] = true; } - return find(ids.begin(), ids.end(), false) == ids.end(); + return find(ids.begin(), ids.end(), false) == ids.end() + && num_edges(g) == num_edges(g.g); } + #endif // NDEBUG } // namespace ue2 diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h index 9eb621e8..4f58dc45 100644 --- a/src/nfagraph/ng_util.h +++ b/src/nfagraph/ng_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,7 +65,7 @@ bool is_dot(NFAVertex v, const GraphT &g) { template static really_inline void succ(const NGHolder &g, NFAVertex v, U *s) { - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; tie(ai, ae) = adjacent_vertices(v, g); s->insert(ai, ae); } @@ -74,14 +74,14 @@ void succ(const NGHolder &g, NFAVertex v, U *s) { template static really_inline void pred(const NGHolder &g, NFAVertex v, U *p) { - NFAGraph::inv_adjacency_iterator it, ite; + NGHolder::inv_adjacency_iterator it, ite; tie(it, ite) = inv_adjacent_vertices(v, g); p->insert(it, ite); } /** returns a vertex with an out edge from v and is not v. 
* v must have exactly one out-edge excluding self-loops. - * will return NFAGraph::null_vertex() if the preconditions don't hold. + * will return NGHolder::null_vertex() if the preconditions don't hold. */ NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex v); @@ -228,6 +228,10 @@ bool isVacuous(const NGHolder &h); * proper successors). */ bool isAnchored(const NGHolder &h); +/** \brief True if the graph contains no anchored vertices (start has no + * successors aside from startDs or vertices connected to startDs). */ +bool isFloating(const NGHolder &h); + /** True if the graph contains no back-edges at all, other than the * startDs self-loop. */ bool isAcyclic(const NGHolder &g); @@ -293,15 +297,29 @@ void clearReports(NGHolder &g); void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new); #ifndef NDEBUG -// Assertions: only available in internal builds -/** \brief Used in sanity-checking assertions: returns true if all vertices - * leading to accept or acceptEod have at least one report ID. */ +// Assertions: only available in internal builds. + +/** + * Used in sanity-checking assertions: returns true if all vertices + * with edges to accept or acceptEod have at least one report ID. Additionally, + * checks that ONLY vertices with edges to accept or acceptEod has reports. + */ bool allMatchStatesHaveReports(const NGHolder &g); +/** + * Assertion: returns true if the vertices in this graph are contiguously (and + * uniquely) numbered from zero. + */ bool hasCorrectlyNumberedVertices(const NGHolder &g); + +/** + * Assertion: returns true if the edges in this graph are contiguously (and + * uniquely) numbered from zero. + */ bool hasCorrectlyNumberedEdges(const NGHolder &g); -#endif + +#endif // NDEBUG } // namespace ue2 diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp new file mode 100644 index 00000000..94e0a998 --- /dev/null +++ b/src/nfagraph/ng_violet.cpp @@ -0,0 +1,2661 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ng_violet.h" + +#include "grey.h" +#include "ng_depth.h" +#include "ng_dominators.h" +#include "ng_dump.h" +#include "ng_equivalence.h" +#include "ng_holder.h" +#include "ng_is_equal.h" +#include "ng_literal_analysis.h" +#include "ng_netflow.h" +#include "ng_prune.h" +#include "ng_redundancy.h" +#include "ng_region.h" +#include "ng_reports.h" +#include "ng_rose.h" +#include "ng_split.h" +#include "ng_util.h" +#include "ng_width.h" +#include "rose/rose_build.h" +#include "rose/rose_build_util.h" +#include "rose/rose_in_dump.h" +#include "rose/rose_in_graph.h" +#include "rose/rose_in_util.h" +#include "util/compare.h" +#include "util/compile_context.h" +#include "util/container.h" +#include "util/graph.h" +#include "util/graph_range.h" +#include "util/make_unique.h" +#include "util/order_check.h" +#include "util/target_info.h" +#include "util/ue2string.h" +#include "util/ue2_containers.h" + +#include +#include +#include +#include +#include +#include +#include + +#define STAGE_DEBUG_PRINTF DEBUG_PRINTF + +using namespace std; +using boost::adaptors::map_values; + +namespace ue2 { + +/* createsAnchoredLHS() is conservative as the depths take into account + * back edges that come from beyond the split point and would be missing after + * the graph is split. */ +static +bool createsAnchoredLHS(const NGHolder &g, const vector &vv, + const vector &depths, + const Grey &grey, depth max_depth = depth::infinity()) { + max_depth = min(max_depth, depth(grey.maxAnchoredRegion)); + + for (auto v : vv) { + /* avoid issues of self loops blowing out depths: + * look at preds, add 1 */ + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == v) { + continue; + } + + u32 idx = g[u].index; + assert(idx < depths.size()); + if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) { + return false; + } + } + } + return true; +} + +/* createsTransientLHS() is conservative as the depths take into account + * back edges that come from beyond the split point and would be missing after + * the graph is split. */ +static +bool createsTransientLHS(const NGHolder &g, const vector &vv, + const vector &depths, + const Grey &grey) { + const depth max_depth(grey.maxHistoryAvailable); + + for (auto v : vv) { + /* avoid issues of self loops blowing out depths: + * look at preds, add 1 */ + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == v) { + continue; + } + + u32 idx = g[u].index; + assert(idx < depths.size()); + if (maxDistFromInit(depths.at(idx)) >= max_depth) { + return false; + } + } + } + return true; +} + +namespace { +/** + * Information on a cut: vertices and literals. + */ +struct VertLitInfo { + VertLitInfo() {} + VertLitInfo(NFAVertex v, const set &litlit, bool c_anch, + bool c_tran = false) + : vv(vector(1, v)), lit(litlit), creates_anchored(c_anch), + creates_transient(c_tran) {} + VertLitInfo(const vector &vv_in, const set &lit_in, + bool c_anch) + : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {} + vector vv; + set lit; + + bool creates_anchored = false; + bool creates_transient = false; +}; + +/** + * \brief Comparator class for sorting LitCollection::lits. + * + * This is separated out from LitCollection itself as passing LitCollection to + * std::sort() would incur a (potentially expensive) copy. 
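The copy concern raised in the comment above is easy to demonstrate in isolation: std::sort() takes its comparator by value, so whatever state the comparator owns travels with every copy. A minimal sketch of the workaround described here, using invented names rather than the Hyperscan types, keeps the comparator down to a single reference to externally owned scoring data while ordering a vector of unique_ptr elements:

#include <algorithm>
#include <cstdio>
#include <memory>
#include <vector>

struct Cand {
    unsigned idx;
};

/* cheap to copy: holds only a reference to the (potentially large) scoring
 * state rather than owning it */
struct ScoreCmp {
    explicit ScoreCmp(const std::vector<unsigned> &scores_in)
        : scores(scores_in) {}
    bool operator()(const std::unique_ptr<Cand> &a,
                    const std::unique_ptr<Cand> &b) const {
        return scores[a->idx] < scores[b->idx];
    }
    const std::vector<unsigned> &scores;
};

int main() {
    std::vector<unsigned> scores = {30, 10, 20};
    std::vector<std::unique_ptr<Cand>> cands;
    for (unsigned i = 0; i < 3; i++) {
        cands.push_back(std::unique_ptr<Cand>(new Cand{i}));
    }
    std::sort(cands.begin(), cands.end(), ScoreCmp(scores));
    std::printf("lowest score: %u\n", cands.front()->idx); /* prints 1 */
    return 0;
}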
+ */ +class LitComparator { +public: + LitComparator(const NGHolder &g_in, bool sa, bool st) + : g(g_in), seeking_anchored(sa), seeking_transient(st) {} + bool operator()(const unique_ptr &a, + const unique_ptr &b) const { + assert(a && b); + + if (seeking_anchored) { + if (a->creates_anchored != b->creates_anchored) { + return a->creates_anchored < b->creates_anchored; + } + } + + if (seeking_transient) { + if (a->creates_transient != b->creates_transient) { + return a->creates_transient < b->creates_transient; + } + } + + u64a score_a = scoreSet(a->lit); + u64a score_b = scoreSet(b->lit); + + if (score_a != score_b) { + return score_a > score_b; + } + + /* vertices should only be in one candidate cut */ + assert(a->vv == b->vv || a->vv.front() != b->vv.front()); + return g[a->vv.front()].index > g[b->vv.front()].index; + } + +private: + const NGHolder &g; /**< graph on which cuts are found */ + + bool seeking_anchored; + bool seeking_transient; +}; +} + +static +size_t shorter_than(const set &s, size_t limit) { + size_t count = 0; + + for (const auto &lit : s) { + if (lit.length() < limit) { + count++; + } + } + + return count; +} + +static +u32 min_len(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)lit.length()); + } + + return rv; +} + +static +u32 min_period(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)minStringPeriod(lit)); + } + DEBUG_PRINTF("min period %u\n", rv); + return rv; +} + +#define MIN_ANCHORED_LEN 2 + +static +bool validateRoseLiteralSetQuality(const set &s, u64a score, + bool anchored, u32 min_allowed_floating_len, + bool desperation) { + u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN + : min_allowed_floating_len; + + assert(none_of(begin(s), end(s), bad_mixed_sensitivity)); + + if (score >= NO_LITERAL_AT_EDGE_SCORE) { + DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size()); + return false; + } + + assert(!s.empty()); + if (s.empty()) { + DEBUG_PRINTF("candidate is too bad/something went wrong\n"); + return false; + } + + u32 s_min_len = min_len(s); + u32 s_min_period = min_period(s); + size_t short_count = shorter_than(s, 5); + + DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u" + " short_count=%zu desp=%d\n", + dumpString(*s.begin()).c_str(), score, s.size(), s_min_len, + s_min_period, short_count, (int)desperation); + + bool ok = true; + + if (s.size() > 10 /* magic number is magic */ + || s_min_len < min_allowed_len + || (s_min_period <= 1 && min_allowed_len != 1)) { + ok = false; + } + + if (!ok && desperation + && s.size() <= 20 /* more magic numbers are magical */ + && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10)) + && s_min_period > 1) { + DEBUG_PRINTF("candidate is ok\n"); + ok = true; + } + + if (!ok && desperation + && s.size() <= 50 /* more magic numbers are magical */ + && s_min_len > 10 + && s_min_period > 1) { + DEBUG_PRINTF("candidate is ok\n"); + ok = true; + } + + if (!ok) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + return true; +} + +static UNUSED +void dumpRoseLiteralSet(const set &s) { + for (UNUSED const auto &lit : s) { + DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str()); + } +} + +static +void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, + const vector *depths, + const set &a_dom, + vector> *lits, + u32 min_allowed_len, bool desperation, + const CompileContext &cc) { + assert(depths || !seeking_anchored); + + map scores; + map> lit_info; + set s; + + for (auto v : a_dom) { + s = 
getLiteralSet(g, v, true); /* RHS will take responsibility for any + revisits to the target vertex */ + + if (s.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + continue; + } + + DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + u64a score = sanitizeAndCompressAndScore(s); + + bool anchored = false; + if (seeking_anchored) { + anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey); + } + + if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, + desperation)) { + continue; + } + + DEBUG_PRINTF("candidate is a candidate\n"); + scores[v] = score; + lit_info[v] = make_unique(v, s, anchored); + } + + /* try to filter out cases where appending some characters produces worse + * literals. Only bother to look back one byte, TODO make better */ + for (auto u : a_dom) { + if (out_degree(u, g) != 1 || !scores[u]) { + continue; + } + NFAVertex v = *adjacent_vertices(u, g).first; + if (contains(scores, v) && scores[v] >= scores[u]) { + DEBUG_PRINTF("killing off v as score %llu >= %llu\n", + scores[v], scores[u]); + lit_info.erase(v); + } + } + + lits->reserve(lit_info.size()); + for (auto &m : lit_info) { + lits->push_back(move(m.second)); + } + DEBUG_PRINTF("%zu candidate literal sets\n", lits->size()); +} + +static +void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, + const vector *depths, + const set &bad, + const set *allowed, + vector> *lits, + u32 min_allowed_len, bool desperation, + const CompileContext &cc) { + /* This allows us to get more places to split the graph as we are not + limited to points where there is a single vertex to split at. */ + + assert(depths || !seeking_anchored); + + /* TODO: operate over 'proto-regions' which ignore back edges */ + auto regions = assignRegions(g); + + set mand, optional; + map > exits; + + for (auto v : vertices_range(g)) { + u32 region = regions[v]; + if (is_any_start(v, g) || region == 0) { + continue; + } + + if (is_any_accept(v, g)) { + continue; + } + + if (!generates_callbacks(g) && is_match_vertex(v, g)) { + /* we cannot leave a completely vacuous infix */ + continue; + } + + if (isRegionExit(g, v, regions)) { + exits[region].push_back(v); + } + + if (isRegionEntry(g, v, regions)) { + // Determine whether this region is mandatory or optional. We only + // need to do this check for the first entry vertex we encounter + // for this region. + if (!contains(mand, region) && !contains(optional, region)) { + if (isOptionalRegion(g, v, regions)) { + optional.insert(region); + } else { + mand.insert(region); + } + } + } + } + + for (const auto &m : exits) { + if (false) { + next_cand: + continue; + } + + const u32 region = m.first; + const vector &vv = m.second; + assert(!vv.empty()); + + if (!contains(mand, region)) { + continue; + } + + for (auto v : vv) { + /* if an exit is in bad, the region is already handled well + * by getSimpleRoseLiterals or is otherwise bad */ + if (contains(bad, v)) { + goto next_cand; + } + /* if we are only allowed to consider some vertices, v must be in + the list; */ + if (allowed && !contains(*allowed, v)) { + goto next_cand; + } + } + + /* the final region may not have a neat exit. 
validate that all exits + * have an edge to each accept or none do */ + bool edge_to_a = edge(vv[0], g.accept, g).second; + bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second; + const auto &reports = g[vv[0]].reports; + for (auto v : vv) { + if (edge_to_a != edge(v, g.accept, g).second) { + goto next_cand; + } + + if (edge_to_aeod != edge(v, g.acceptEod, g).second) { + goto next_cand; + } + + if (g[v].reports != reports) { + goto next_cand; + } + } + + DEBUG_PRINTF("inspecting region %u\n", region); + set s; + for (auto v : vv) { + DEBUG_PRINTF(" exit vertex: %u\n", g[v].index); + /* Note: RHS can not be depended on to take all subsequent revisits + * to this vertex */ + set ss = getLiteralSet(g, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + goto next_cand; + } + insert(&s, ss); + } + + assert(!s.empty()); + + DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + u64a score = sanitizeAndCompressAndScore(s); + + DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + + bool anchored = false; + if (seeking_anchored) { + anchored = createsAnchoredLHS(g, vv, *depths, cc.grey); + } + + if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, + desperation)) { + goto next_cand; + } + + DEBUG_PRINTF("candidate is a candidate\n"); + lits->push_back(make_unique(vv, s, anchored)); + } +} + +static +void filterCandPivots(const NGHolder &g, const set &cand_raw, + set *out) { + for (auto u : cand_raw) { + const CharReach &u_cr = g[u].char_reach; + if (u_cr.count() > 40) { + continue; /* too wide to be plausible */ + } + + if (u_cr.count() > 2) { + /* include u as a candidate as successor may have backed away from + * expanding through it */ + out->insert(u); + continue; + } + + NFAVertex v = getSoleDestVertex(g, u); + if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) { + const CharReach &v_cr = g[v].char_reach; + if (v_cr.count() == 1 || v_cr.isCaselessChar()) { + continue; /* v will always generate better literals */ + } + } + + out->insert(u); + } +} + +/* cand_raw is the candidate set before filtering points which are clearly + * a bad idea. 
*/ +static +void getCandidatePivots(const NGHolder &g, set *cand, + set *cand_raw) { + ue2::unordered_map dominators = findDominators(g); + + set accepts; + + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + if (is_special(v, g)) { + continue; + } + accepts.insert(v); + } + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { + if (is_special(v, g)) { + continue; + } + accepts.insert(v); + } + + assert(!accepts.empty()); + + vector dom_trace; + auto ait = accepts.begin(); + assert(ait != accepts.end()); + NFAVertex curr = *ait; + while (curr && !is_special(curr, g)) { + dom_trace.push_back(curr); + curr = dominators[curr]; + } + reverse(dom_trace.begin(), dom_trace.end()); + for (++ait; ait != accepts.end(); ++ait) { + curr = *ait; + vector dom_trace2; + while (curr && !is_special(curr, g)) { + dom_trace2.push_back(curr); + curr = dominators[curr]; + } + reverse(dom_trace2.begin(), dom_trace2.end()); + auto dti = dom_trace.begin(), dtie = dom_trace.end(); + auto dtj = dom_trace2.begin(), dtje = dom_trace2.end(); + while (dti != dtie && dtj != dtje && *dti == *dtj) { + ++dti; + ++dtj; + } + dom_trace.erase(dti, dtie); + } + + cand_raw->insert(dom_trace.begin(), dom_trace.end()); + + filterCandPivots(g, *cand_raw, cand); +} + +static +unique_ptr findBestSplit(const NGHolder &g, + const vector *depths, + bool for_prefix, u32 min_len, + const set *allowed_cand, + const set *disallowed_cand, + const CompileContext &cc) { + assert(!for_prefix || depths); + + /* look for a single simple split point */ + set cand; + set cand_raw; + + getCandidatePivots(g, &cand, &cand_raw); + + if (allowed_cand) { + set cand2; + set cand2_raw; + set_intersection(allowed_cand->begin(), allowed_cand->end(), + cand.begin(), cand.end(), + inserter(cand2, cand2.begin())); + + set_intersection(allowed_cand->begin(), allowed_cand->end(), + cand_raw.begin(), cand_raw.end(), + inserter(cand2_raw, cand2_raw.begin())); + + cand = std::move(cand2); + cand_raw = std::move(cand2_raw); + } + if (disallowed_cand) { + DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size()); + DEBUG_PRINTF("|old cand| = %zu\n", cand.size()); + erase_all(&cand, *disallowed_cand); + insert(&cand_raw, *disallowed_cand); + } + + if (!generates_callbacks(g)) { + /* not output exposed so must leave some RHS */ + for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) { + cand.erase(v); + cand_raw.erase(v); + } + + for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) { + cand.erase(v); + cand_raw.erase(v); + } + } + + DEBUG_PRINTF("|cand| = %zu\n", cand.size()); + + bool seeking_anchored = for_prefix; + bool seeking_transient = for_prefix; //cc.streaming; + + /* TODO: revisit when backstop goes away */ + bool desperation = for_prefix && cc.streaming; + + vector> lits; /**< sorted list of potential cuts */ + + getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len, + desperation, cc); + getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand, + &lits, min_len, desperation, cc); + + if (lits.empty()) { + DEBUG_PRINTF("no literals found\n"); + return nullptr; + } + + if (seeking_transient) { + for (auto &a : lits) { + a->creates_transient + = createsTransientLHS(g, a->vv, *depths, cc.grey); + } + } + + auto cmp = LitComparator(g, seeking_anchored, seeking_transient); + + unique_ptr best = move(lits.back()); + lits.pop_back(); + while (!lits.empty()) { + if (cmp(best, lits.back())) { + best = move(lits.back()); + } + lits.pop_back(); + } + + DEBUG_PRINTF("best is '%s' %u a%d 
t%d\n", + dumpString(*best->lit.begin()).c_str(), + g[best->vv.front()].index, + depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0, + depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0); + + return best; +} + +static +void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ, + bool overhang_ok, flat_set &bad) { + DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n", + num_vertices(h), succ.length()); + + map > curr; + for (const auto &e : in_edges_range(h.accept, h)) { + curr[source(e, h)].insert(e); + } + + map > next; + for (auto it = succ.rbegin(); it != succ.rend(); ++it) { + for (const auto &path : curr) { + NFAVertex u = path.first; + const auto &path_set = path.second; + if (u == h.start && overhang_ok) { + DEBUG_PRINTF("poisoning early %zu [overhang]\n", + path_set.size()); + insert(&bad, path_set); + continue; + } + if (overlaps(h[u].char_reach, *it)) { + for (const auto &e : in_edges_range(u, h)) { + auto &new_path_set = next[source(e, h)]; + insert(&new_path_set, path_set); + new_path_set.insert(e); + } + } + } + DEBUG_PRINTF("succ char matches at %zu paths\n", next.size()); + assert(overhang_ok || !curr.empty()); + swap(curr, next); + next.clear(); + } + + assert(overhang_ok || !curr.empty()); + for (const auto &path : curr) { + insert(&bad, path.second); + DEBUG_PRINTF("poisoning %zu vertices\n", path.second.size()); + } +} + +static +void poisonForGoodPrefix(const NGHolder &h, + const vector &depths, + flat_set &bad, const Grey &grey) { + for (const auto &v : vertices_range(h)) { + if (!createsAnchoredLHS(h, {v}, depths, grey) + && !createsTransientLHS(h, {v}, depths, grey)) { + insert(&bad, in_edges_range(v, h)); + } + } +} + +static +flat_set poisonEdges(const NGHolder &h, + const vector *depths, + const RoseInGraph &vg, const vector &ee, + bool for_prefix, const Grey &grey) { + DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size()); + + /* poison edges covered by successor literal */ + + set > succs; + for (const RoseInEdge &ve : ee) { + if (vg[target(ve, vg)].type != RIV_LITERAL) { + /* nothing to poison in suffixes/outfixes */ + assert(vg[target(ve, vg)].type == RIV_ACCEPT); + continue; + } + succs.insert({vg[target(ve, vg)].s, + vg[source(ve, vg)].type == RIV_LITERAL}); + + } + + DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size()); + + flat_set bad; + for (const auto &p : succs) { + poisonFromSuccessor(h, p.first, p.second, bad); + } + + /* poison edges which don't significantly improve a prefix */ + + if (for_prefix) { + poisonForGoodPrefix(h, *depths, bad, grey); + } + + return bad; +} + +static +set poisonVertices(const NGHolder &h, const RoseInGraph &vg, + const vector &ee, const Grey &grey) { + flat_set bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey); + set bad_vertices; + for (const NFAEdge &e : bad_edges) { + bad_vertices.insert(target(e, h)); + DEBUG_PRINTF("bad: %u->%u\n", h[source(e, h)].index, + h[target(e, h)].index); + } + + return bad_vertices; +} + +static +unique_ptr findBestNormalSplit(const NGHolder &g, + const RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX); + set bad_vertices = poisonVertices(g, vg, ee, cc.grey); + + return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength, + nullptr, &bad_vertices, cc); +} + +static +unique_ptr findSimplePrefixSplit(const NGHolder &g, + const CompileContext &cc) { + DEBUG_PRINTF("looking for simple prefix split\n"); + 
bool anchored = !proper_out_degree(g.startDs, g); + NFAVertex u = anchored ? g.start : g.startDs; + + if (out_degree(u, g) != 2) { /* startDs + succ */ + return nullptr; + } + + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(u, g)) { + if (t != g.startDs) { + assert(!v); + v = t; + } + } + assert(v); + + if (!anchored) { + if (out_degree(g.start, g) > 2) { + return nullptr; + } + if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) { + return nullptr; + } + } + + NFAVertex best_v = NGHolder::null_vertex(); + ue2_literal best_lit; + + u32 limit = cc.grey.maxHistoryAvailable; + if (anchored) { + LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion); + } + + ue2_literal curr_lit; + for (u32 i = 0; i < limit; i++) { + const auto &v_cr = g[v].char_reach; + if (v_cr.count() == 1 || v_cr.isCaselessChar()) { + curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar()); + } else { + curr_lit.clear(); + } + + if (curr_lit.length() > best_lit.length()) { + best_lit = curr_lit; + best_v = v; + } + + if (out_degree(v, g) != 1) { + break; + } + v = *adjacent_vertices(v, g).first; + } + + if (best_lit.length() < cc.grey.minRoseLiteralLength) { + return nullptr; + } + + set best_lit_set({best_lit}); + if (bad_mixed_sensitivity(best_lit)) { + sanitizeAndCompressAndScore(best_lit_set); + } + + return ue2::make_unique(best_v, best_lit_set, anchored, true); +} + +static +unique_ptr findBestPrefixSplit(const NGHolder &g, + const vector &depths, + const RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + assert(g.kind == NFA_PREFIX); + set bad_vertices = poisonVertices(g, vg, ee, cc.grey); + auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength, + nullptr, &bad_vertices, cc); + + /* large back edges may prevent us identifying anchored or transient cases + * properly - use a simple walk instead */ + if (!rv || !(rv->creates_transient || rv->creates_anchored)) { + auto rv2 = findSimplePrefixSplit(g, cc); + if (rv2) { + return rv2; + } + } + + return rv; +} + +static +unique_ptr findBestCleanSplit(const NGHolder &g, + const CompileContext &cc) { + assert(g.kind != NFA_PREFIX); + set cleanSplits; + for (NFAVertex v : vertices_range(g)) { + if (!g[v].char_reach.all() || !edge(v, v, g).second) { + continue; + } + insert(&cleanSplits, inv_adjacent_vertices(v, g)); + cleanSplits.erase(v); + } + cleanSplits.erase(g.start); + if (cleanSplits.empty()) { + return nullptr; + } + return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen, + &cleanSplits, nullptr, cc); +} + +static +bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { + set curr, next; + curr.insert(g.accept); + + for (auto it = lit.rbegin(); it != lit.rend(); ++it) { + next.clear(); + + for (auto v : curr) { + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start) { + if (overhang_ok) { + DEBUG_PRINTF("bail\n"); + return true; + } else { + continue; /* it is not possible for a lhs literal to + * overhang the start */ + } + } + + const CharReach &cr = g[u].char_reach; + if (!overlaps(*it, cr)) { + continue; + } + + next.insert(u); + } + } + + curr.swap(next); + } + + return !curr.empty(); +} + +static +bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, + const vector &ee, const VertLitInfo &split) { + const vector &splitters = split.vv; + assert(!splitters.empty()); + + shared_ptr lhs = make_shared(); + shared_ptr rhs = make_shared(); + + ue2::unordered_map lhs_map; + ue2::unordered_map rhs_map; + + 
splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map); + DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n", + to_string(base_graph.kind).c_str(), num_vertices(base_graph), + to_string(lhs->kind).c_str(), num_vertices(*lhs), + to_string(rhs->kind).c_str(), num_vertices(*rhs)); + + bool suffix = vg[target(ee.front(), vg)].type == RIV_ACCEPT; + + if (is_triggered(base_graph)) { + /* if we are already guarded, check if the split reduces the size of + * the problem before continuing with the split */ + if (num_vertices(*lhs) >= num_vertices(base_graph) + && !(suffix && isVacuous(*rhs))) { + DEBUG_PRINTF("split's lhs is no smaller\n"); + return false; + } + + if (num_vertices(*rhs) >= num_vertices(base_graph)) { + DEBUG_PRINTF("split's rhs is no smaller\n"); + return false; + } + } + + bool do_accept = false; + bool do_accept_eod = false; + assert(rhs); + if (isVacuous(*rhs) && suffix) { + if (edge(rhs->start, rhs->accept, *rhs).second) { + DEBUG_PRINTF("rhs has a cliche\n"); + do_accept = true; + remove_edge(rhs->start, rhs->accept, *rhs); + } + + if (edge(rhs->start, rhs->acceptEod, *rhs).second) { + DEBUG_PRINTF("rhs has an eod cliche\n"); + do_accept_eod = true; + remove_edge(rhs->start, rhs->acceptEod, *rhs); + } + + renumber_edges(*rhs); + } + + /* check if we still have a useful graph left over */ + bool do_norm = out_degree(rhs->start, *rhs) != 1; + + set splitter_reports; + for (auto v : splitters) { + insert(&splitter_reports, base_graph[v].reports); + } + + /* find the targets of each source vertex; note the use of vectors to + * preserve deterministic ordering */ + vector sources; + map> images; + for (const RoseInEdge &e : ee) { + RoseInVertex src = source(e, vg); + RoseInVertex dest = target(e, vg); + if (!contains(images, src)) { + sources.push_back(src); + } + images[src].push_back(dest); + remove_edge(e, vg); + } + + map, vector> verts_by_image; + + for (const auto &u : sources) { + const auto &image = images[u]; + + if (contains(verts_by_image, image)) { + for (RoseInVertex v : verts_by_image[image]) { + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + } + continue; + } + + for (const auto &lit : split.lit) { + assert(!bad_mixed_sensitivity(lit)); + + /* don't allow overhang in can_match() as literals should + * correspond to the edge graph being split; overhanging the graph + * would indicate a false path.*/ + if (!can_match(*lhs, lit, false)) { + DEBUG_PRINTF("'%s' did not match lhs\n", + escapeString(lit).c_str()); + continue; + } + + DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); + auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + + /* work out delay later */ + if (do_accept) { + DEBUG_PRINTF("rhs has a cliche\n"); + auto tt = add_vertex(RoseInVertexProps::makeAccept( + splitter_reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (do_accept_eod) { + DEBUG_PRINTF("rhs has an eod cliche\n"); + auto tt = add_vertex(RoseInVertexProps::makeAcceptEod( + splitter_reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (do_norm) { + assert(out_degree(rhs->start, *rhs) > 1); + for (RoseInVertex dest : image) { + add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg); + } + } + verts_by_image[image].push_back(v); + } + } + + assert(hasCorrectlyNumberedVertices(*rhs)); + assert(hasCorrectlyNumberedEdges(*rhs)); + assert(hasCorrectlyNumberedVertices(*lhs)); + assert(hasCorrectlyNumberedEdges(*lhs)); + + return true; +} + +#define MAX_NETFLOW_CUT_WIDTH 40 /* 
magic number is magic */ +#define MAX_LEN_2_LITERALS_PER_CUT 3 + +static +bool checkValidNetflowLits(NGHolder &h, const vector &scores, + const map> &cut_lits, + u32 min_allowed_length) { + DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(), + min_allowed_length); + if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) { + return false; + } + + u32 len_2_count = 0; + + for (const auto &cut : cut_lits) { + if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) { + DEBUG_PRINTF("cut uses a forbidden edge\n"); + return false; + } + + if (min_len(cut.second) < min_allowed_length) { + DEBUG_PRINTF("cut uses a bad literal\n"); + return false; + } + + for (const auto &lit : cut.second) { + if (lit.length() == 2) { + len_2_count++; + } + } + } + + if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) { + return false; + } + + return true; +} + +static +void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, + const vector &to_cut, + const vector &cut, + const map> &cut_lits) { + DEBUG_PRINTF("splitting %s:\n", to_string(h.kind).c_str()); + + /* create literal vertices and connect preds */ + unordered_set done_sources; + map>> verts_by_source; + for (const RoseInEdge &ve : to_cut) { + assert(&h == &*vg[ve].graph); + RoseInVertex src = source(ve, vg); + if (!done_sources.insert(src).second) { + continue; /* already processed */ + } + + /* iterate over cut for determinism */ + for (const auto &e : cut) { + NFAVertex prev_v = source(e, h); + NFAVertex pivot = target(e, h); + + DEBUG_PRINTF("splitting on pivot %u\n", h[pivot].index); + ue2::unordered_map temp_map; + shared_ptr new_lhs = make_shared(); + splitLHS(h, pivot, new_lhs.get(), &temp_map); + + /* want to cut off paths to pivot from things other than the pivot - + * makes a more svelte graphy */ + clear_in_edges(temp_map[pivot], *new_lhs); + add_edge(temp_map[prev_v], temp_map[pivot], *new_lhs); + + pruneUseless(*new_lhs, false); + renumber_vertices(*new_lhs); + renumber_edges(*new_lhs); + + DEBUG_PRINTF(" into lhs %s\n", to_string(new_lhs->kind).c_str()); + + assert(hasCorrectlyNumberedVertices(*new_lhs)); + assert(hasCorrectlyNumberedEdges(*new_lhs)); + + const set &lits = cut_lits.at(e); + for (const auto &lit : lits) { + if (!can_match(*new_lhs, lit, is_triggered(h))) { + continue; + } + + RoseInVertex v + = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + + /* if this is a prefix/infix an edge directly to accept should + * represent a false path as we have poisoned vertices covered + * by the literals. 
*/ + if (generates_callbacks(h)) { + if (edge(pivot, h.accept, h).second) { + DEBUG_PRINTF("adding acceptEod\n"); + /* literal has a direct connection to accept */ + const flat_set &reports = h[pivot].reports; + auto tt = add_vertex( + RoseInVertexProps::makeAccept(reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (edge(pivot, h.acceptEod, h).second) { + assert(generates_callbacks(h)); + DEBUG_PRINTF("adding acceptEod\n"); + /* literal has a direct connection to accept */ + const flat_set &reports = h[pivot].reports; + auto tt = add_vertex( + RoseInVertexProps::makeAcceptEod(reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + } + + add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg); + verts_by_source[src].push_back({v, pivot}); + } + } + } + + /* wire the literal vertices up to successors */ + map, shared_ptr > done_rhs; + for (const RoseInEdge &ve : to_cut) { + RoseInVertex src = source(ve, vg); + RoseInVertex dest = target(ve, vg); + + /* iterate over cut for determinism */ + for (const auto &elem : verts_by_source[src]) { + NFAVertex pivot = elem.second; + RoseInVertex v = elem.first; + + vector adj; + insert(&adj, adj.end(), adjacent_vertices(pivot, h)); + /* we can ignore presence of accept, accepteod in adj as it is best + effort */ + + if (!contains(done_rhs, adj)) { + ue2::unordered_map temp_map; + shared_ptr new_rhs = make_shared(); + splitRHS(h, adj, new_rhs.get(), &temp_map); + remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); + remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); + renumber_edges(*new_rhs); + DEBUG_PRINTF(" into rhs %s\n", + to_string(new_rhs->kind).c_str()); + done_rhs.emplace(adj, new_rhs); + } + + assert(done_rhs[adj].get()); + shared_ptr new_rhs = done_rhs[adj]; + + assert(hasCorrectlyNumberedVertices(*new_rhs)); + assert(hasCorrectlyNumberedEdges(*new_rhs)); + + if (vg[dest].type == RIV_LITERAL + && !can_match(*new_rhs, vg[dest].s, true)) { + continue; + } + + if (out_degree(new_rhs->start, *new_rhs) != 1) { + add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg); + } + } + + remove_edge(ve, vg); + } +} + +static +bool doNetflowCut(NGHolder &h, + const vector *depths, + RoseInGraph &vg, + const vector &ee, bool for_prefix, + const Grey &grey, u32 min_allowed_length = 0U) { + ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength); + + DEBUG_PRINTF("doing netflow cut\n"); + /* TODO: we should really get literals/scores from the full graph as this + * allows us to overlap with previous cuts. */ + assert(!ee.empty()); + assert(&h == &*vg[ee.front()].graph); + assert(!for_prefix || depths); + + if (num_edges(h) > grey.maxRoseNetflowEdges) { + /* We have a limit on this because scoring edges and running netflow + * gets very slow for big graphs. 
*/ + DEBUG_PRINTF("too many edges, skipping netflow cut\n"); + return false; + } + + assert(hasCorrectlyNumberedVertices(h)); + assert(hasCorrectlyNumberedEdges(h)); + + auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey); + + /* Step 1: Get scores for all edges */ + vector scores = scoreEdges(h, known_bad); /* scores by edge_index */ + + /* Step 2: Find cutset based on scores */ + vector cut = findMinCut(h, scores); + + /* Step 3: Get literals corresponding to cut edges */ + map> cut_lits; + for (const auto &e : cut) { + set lits = getLiteralSet(h, e); + sanitizeAndCompressAndScore(lits); + + cut_lits[e] = lits; + } + + /* if literals are underlength bail or if it involves a forbidden edge*/ + if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) { + return false; + } + DEBUG_PRINTF("splitting\n"); + + /* Step 4: Split graph based on cuts */ + splitEdgesByCut(h, vg, ee, cut, cut_lits); + + return true; +} + +static +bool deanchorIfNeeded(NGHolder &g) { + DEBUG_PRINTF("hi\n"); + if (proper_out_degree(g.startDs, g)) { + return false; + } + + /* look for a non-special dot with a loop following start */ + set succ_g; + insert(&succ_g, adjacent_vertices(g.start, g)); + succ_g.erase(g.startDs); + + for (auto v : adjacent_vertices_range(g.start, g)) { + DEBUG_PRINTF("inspecting cand %u || = %zu\n", g[v].index, + g[v].char_reach.count()); + + if (v == g.startDs || !g[v].char_reach.all()) { + continue; + } + + set succ_v; + insert(&succ_v, adjacent_vertices(v, g)); + + if (succ_v == succ_g) { + DEBUG_PRINTF("found ^.*\n"); + for (auto succ : adjacent_vertices_range(g.start, g)) { + if (succ == g.startDs) { + continue; + } + add_edge(g.startDs, succ, g); + } + clear_vertex(v, g); + remove_vertex(v, g); + renumber_vertices(g); + return true; + } + + if (succ_g.size() == 1 && hasSelfLoop(v, g)) { + DEBUG_PRINTF("found ^.+\n"); + add_edge(g.startDs, v, g); + remove_edge(v, v, g); + return true; + } + } + + return false; +} + +static +RoseInGraph populateTrivialGraph(const NGHolder &h) { + RoseInGraph g; + shared_ptr root_g = cloneHolder(h); + bool orig_anch = isAnchored(*root_g); + orig_anch |= deanchorIfNeeded(*root_g); + + DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch); + + auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g); + auto accept = add_vertex(RoseInVertexProps::makeAccept(set()), g); + + add_edge(start, accept, RoseInEdgeProps(root_g, 0), g); + + return g; +} + +static +void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) { + STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n"); + if (num_vertices(vg) > 2) { + /* must be at least one literal aside from start and accept */ + return; + } + + RoseInEdge e = *edges(vg).first; + + NGHolder &h = *vg[e].graph; + + renumber_vertices(h); + renumber_edges(h); + + unique_ptr split = findBestNormalSplit(h, vg, {e}, cc); + + if (split && splitRoseEdge(h, vg, {e}, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + } else { + doNetflowCut(h, nullptr, vg, {e}, false, cc.grey); + } +} + +static +void removeRedundantPrefixes(RoseInGraph &g) { + STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n"); + + for (const RoseInEdge &e : edges_range(g)) { + RoseInVertex s = source(e, g); + RoseInVertex t = target(e, g); + + if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) { + continue; + } + + if (!g[e].graph) { + continue; + } + + assert(!g[t].delay); + const ue2_literal &lit = g[t].s; + + if (!literalIsWholeGraph(*g[e].graph, lit)) { + DEBUG_PRINTF("not whole graph\n"); + continue; + } + + if 
(!isFloating(*g[e].graph)) { + DEBUG_PRINTF("not floating\n"); + continue; + } + g[e].graph.reset(); + } +} + +static +u32 maxDelay(const CompileContext &cc) { + if (!cc.streaming) { + return MO_INVALID_IDX; + } + return cc.grey.maxHistoryAvailable; +} + +static +void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, + const CompileContext &cc) { + STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n"); + + vector to_anchor; + for (const RoseInEdge &e : edges_range(g)) { + RoseInVertex s = source(e, g); + RoseInVertex t = target(e, g); + + if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) { + continue; + } + + if (g[t].type != RIV_LITERAL) { + continue; + } + + if (!g[e].graph) { + continue; + } + + assert(!g[t].delay); + const ue2_literal &lit = g[t].s; + + DEBUG_PRINTF("removing states for literal: %s\n", + dumpString(lit).c_str()); + + unique_ptr h = cloneHolder(*g[e].graph); + const u32 max_delay = maxDelay(cc); + + u32 delay = removeTrailingLiteralStates(*h, lit, max_delay, + false /* can't overhang start */); + + DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay); + + if (edge(h->startDs, h->accept, *h).second) { + /* we should have delay == lit.length(), but in really complex + * cases we may fail to identify that we can remove the whole + * graph. Regardless, the fact that sds is wired to accept means the + * graph serves no purpose. */ + DEBUG_PRINTF("whole graph\n"); + g[e].graph.reset(); + continue; + } + + if (delay == lit.length() && edge(h->start, h->accept, *h).second + && num_vertices(*h) == N_SPECIALS) { + to_anchor.push_back(e); + continue; + } + + /* if we got here we should still have an interesting graph */ + assert(delay == max_delay || num_vertices(*h) > N_SPECIALS); + + if (delay && delay != MO_INVALID_IDX) { + DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get()); + + g[e].graph = move(h); + g[e].graph_lag = delay; + } + } + + if (!to_anchor.empty()) { + RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g); + + for (RoseInEdge e : to_anchor) { + DEBUG_PRINTF("rehoming to anchor\n"); + RoseInVertex v = target(e, g); + add_edge(anch, v, g); + remove_edge(e, g); + } + } +} + +static +bool isStarCliche(const NGHolder &g) { + DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g)); + + bool nonspecials_seen = false; + + for (auto v : vertices_range(g)) { + if (is_special(v, g)) { + continue; + } + + if (nonspecials_seen) { + return false; + } + nonspecials_seen = true; + + if (!g[v].char_reach.all()) { + return false; + } + + if (!hasSelfLoop(v, g)) { + return false; + } + if (!edge(v, g.accept, g).second) { + return false; + } + } + + if (!nonspecials_seen) { + return false; + } + + if (!edge(g.start, g.accept, g).second) { + return false; + } + + return true; +} + +static +void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, + const vector &ee, + const CompileContext &cc) { + /* TODO: This could be better by not creating a separate graph for each + * successor literal. This would require using distinct report ids and also + * taking into account overlap of successor literals. 
*/ + + set preds; + for (const RoseInEdge &e : ee) { + RoseInVertex u = source(e, ig); + assert(ig[u].type == RIV_LITERAL); + assert(!ig[e].graph_lag); + assert(!ig[u].delay); + preds.insert(ig[u].s); + } + + set succs; + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, ig); + assert(ig[v].type == RIV_LITERAL); + assert(!ig[v].delay); + succs.insert(ig[v].s); + } + + map, u32> > graphs; /* + delay */ + + for (const ue2_literal &right : succs) { + size_t max_overlap = 0; + for (const ue2_literal &left : preds) { + size_t overlap = maxOverlap(left, right, 0); + ENSURE_AT_LEAST(&max_overlap, overlap); + } + + u32 max_allowed_delay = right.length() - max_overlap; + + if (cc.streaming) { + LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable); + } + + if (!max_allowed_delay) { + continue; + } + + shared_ptr h_new = cloneHolder(h); + + u32 delay = removeTrailingLiteralStates(*h_new, right, + max_allowed_delay); + + if (delay == MO_INVALID_IDX) { + /* successor literal could not match infix -> ignore false path */ + assert(0); + continue; + } + + graphs[right] = make_pair(h_new, delay); + } + + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, ig); + const ue2_literal &succ = ig[v].s; + if (!contains(graphs, succ)) { + continue; + } + + ig[e].graph = graphs[succ].first; + ig[e].graph_lag = graphs[succ].second; + + if (isStarCliche(*ig[e].graph)) { + DEBUG_PRINTF("is a X star!\n"); + ig[e].graph.reset(); + ig[e].graph_lag = 0; + } + } +} + +static +void removeRedundantLiteralsFromInfixes(RoseInGraph &g, + const CompileContext &cc) { + vector seen_order; + map> infixes; + + for (const RoseInEdge &e : edges_range(g)) { + RoseInVertex s = source(e, g); + RoseInVertex t = target(e, g); + + if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) { + continue; + } + + if (!g[e].graph) { + continue; + } + + assert(!g[t].delay); + + NGHolder *h = g[e].graph.get(); + if (!contains(infixes, h)) { + seen_order.push_back(h); + } + infixes[h].push_back(e); + } + + for (NGHolder *h : seen_order) { + removeRedundantLiteralsFromInfix(*h, g, infixes[h], cc); + } +} + + +static +void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) { + removeRedundantLiteralsFromPrefixes(g, cc); + removeRedundantLiteralsFromInfixes(g, cc); +} + +static +RoseInVertex getStart(RoseInGraph &vg) { + for (RoseInVertex v : vertices_range(vg)) { + if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) { + return v; + } + } + assert(0); + return RoseInGraph::null_vertex(); +} + +/** + * Finds the initial accept vertex created to which suffix/outfixes are + * attached. 
+ */ +static +RoseInVertex getPrimaryAccept(RoseInGraph &vg) { + for (RoseInVertex v : vertices_range(vg)) { + if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) { + return v; + } + } + assert(0); + return RoseInGraph::null_vertex(); +} + +static +bool willBeTransient(const depth &max_depth, const CompileContext &cc) { + if (!cc.streaming) { + return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH); + } else { + return max_depth <= depth(cc.grey.maxHistoryAvailable + 1); + } +} + +static +bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) { + return max_depth <= depth(grey.maxAnchoredRegion); +} + +static +unique_ptr make_chain(u32 count) { + assert(count); + + auto rv = make_unique(NFA_INFIX); + + NGHolder &h = *rv; + + NFAVertex u = h.start; + for (u32 i = 0; i < count; i++) { + NFAVertex v = add_vertex(h); + h[v].char_reach = CharReach::dot(); + add_edge(u, v, h); + u = v; + } + h[u].reports.insert(0); + add_edge(u, h.accept, h); + + return rv; +} + +#define SHORT_TRIGGER_LEN 16 + +static +bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + /* check max width and literal lengths to see if possible */ + size_t min_lit = (size_t)~0ULL; + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, vg); + LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length()); + } + + if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) { + return false; + } + + depth max_width = findMaxWidth(h); + + u32 delta = min_lit - SHORT_TRIGGER_LEN; + + if (!willBeTransient(max_width - depth(delta), cc) + && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) { + return false; + } + + DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit); + DEBUG_PRINTF("delta = %u\n", delta); + + /* try split */ + map > graphs; + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, vg); + + shared_ptr h_new = cloneHolder(h); + + u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta); + + DEBUG_PRINTF("delay %u\n", delay); + + if (delay != delta) { + DEBUG_PRINTF("unable to trim literal\n"); + return false; + } + + if (in_degree(v, vg) != 1) { + DEBUG_PRINTF("complicated\n"); + return false; + } + + DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new)); + assert(willBeTransient(findMaxWidth(*h_new), cc) + || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey)); + + graphs[v] = h_new; + } + + /* add .{repeats} from prefixes to long literals */ + for (const RoseInEdge &e : ee) { + RoseInVertex s = source(e, vg); + RoseInVertex t = target(e, vg); + + remove_edge(e, vg); + const ue2_literal &orig_lit = vg[t].s; + + ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta); + + ue2_literal lit2(orig_lit.end() - delta, orig_lit.end()); + + assert(lit.length() + delta == orig_lit.length()); + + vg[t].s = lit2; + + RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg); + add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg); + } + + DEBUG_PRINTF("success\n"); + /* TODO: alter split point to avoid pathological splits */ + return true; +} + +static +bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, + const CompileContext &cc) { + DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h, + num_vertices(h)); + + renumber_vertices(h); + renumber_edges(h); + + vector depths; + calcDepths(h, depths); + + /* If the reason the prefix is not transient is due to a very long literal + * following, we can make it transient by restricting 
ourselves to using + * just the head of the literal. */ + if (makeTransientFromLongLiteral(h, vg, ee, cc)) { + return true; + } + + unique_ptr split = findBestPrefixSplit(h, depths, vg, ee, cc); + + if (split && (split->creates_transient || split->creates_anchored) + && splitRoseEdge(h, vg, ee, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + return true; + } + + /* large back edges may prevent us identifing anchored or transient cases + * properly - use a simple walk instead */ + + if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) { + return true; + } + + if (split && splitRoseEdge(h, vg, ee, *split)) { + /* use the simple split even though it doesn't create a transient + * prefix */ + DEBUG_PRINTF("split on simple literal\n"); + return true; + } + + /* look for netflow cuts which don't produce good prefixes */ + if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) { + return true; + } + + if (ee.size() > 1) { + DEBUG_PRINTF("split the prefix apart based on succ literals\n"); + unordered_map, vector >, + NGHolderHasher, NGHolderEqual> trimmed; + + for (const auto &e : ee) { + shared_ptr hh = cloneHolder(h); + auto succ_lit = vg[target(e, vg)].s; + u32 delay = removeTrailingLiteralStates(*hh, succ_lit, + succ_lit.length(), + false /* can't overhang start */); + if (!delay) { + DEBUG_PRINTF("could not remove any literal, skip over\n"); + continue; + } + + trimmed[hh].emplace_back(e, delay); + } + + if (trimmed.size() == 1) { + return false; + } + + /* shift the contents to a vector so we can modify the graphs without + * violating the map's invariants. */ + vector, vector > > > + trimmed_vec(trimmed.begin(), trimmed.end()); + trimmed.clear(); + for (auto &elem : trimmed_vec) { + shared_ptr &hp = elem.first; + NGHolder &eh = *hp; + + vector base_states; + insert(&base_states, base_states.end(), + inv_adjacent_vertices(eh.accept, eh)); + clear_in_edges(eh.accept, eh); + + for (auto v : base_states) { + eh[v].reports.clear(); /* clear report from old accepts */ + } + + for (const auto &edge_delay : elem.second) { + const RoseInEdge &e = edge_delay.first; + u32 delay = edge_delay.second; + auto succ_lit = vg[target(e, vg)].s; + + vg[e].graph = hp; + assert(delay <= succ_lit.length()); + restoreTrailingLiteralStates(*vg[e].graph, succ_lit, delay, + base_states); + } + } + return true; + } + + return false; +} + +#define MAX_FIND_BETTER_PREFIX_GEN 4 +#define MAX_FIND_BETTER_PREFIX_COUNT 100 + +static +void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { + STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n"); + RoseInVertex start = getStart(vg); + + bool changed; + u32 gen = 0; + do { + DEBUG_PRINTF("gen %u\n", gen); + changed = false; + vector seen_order; + map > prefixes; + + /* find prefixes */ + for (const RoseInEdge &e : out_edges_range(start, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[target(e, vg)].type == RIV_LITERAL); + if (vg[e].graph) { + NGHolder *h = vg[e].graph.get(); + if (!contains(prefixes, h)) { + seen_order.push_back(h); + } + prefixes[h].push_back(e); + } + } + + if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) { + break; + } + + /* look for bad prefixes and try to split */ + for (NGHolder *h : seen_order) { + depth max_width = findMaxWidth(*h); + if (willBeTransient(max_width, cc) + || willBeAnchoredTable(max_width, cc.grey)) { + continue; + } + + changed = improvePrefix(*h, vg, prefixes[h], cc); + } + } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN); +} + +#define STRONG_LITERAL_LENGTH 20 +#define 
MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10 + +static +bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + DEBUG_PRINTF("looking for string literal\n"); + unique_ptr split = findBestNormalSplit(h, vg, ee, cc); + + if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) { + DEBUG_PRINTF("splitting simple literal\n"); + return splitRoseEdge(h, vg, ee, *split); + } + + return false; +} + +static +void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetExtractStrongLiterals) { + return; + } + STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n"); + set stuck; + + bool changed; + do { + changed = false; + + vector seen_order; + map > edges_by_graph; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[source(ve, vg)].type != RIV_LITERAL) { + continue; + } + if (vg[ve].graph) { + if (!contains(edges_by_graph, vg[ve].graph.get())) { + seen_order.push_back(vg[ve].graph.get()); + } + edges_by_graph[vg[ve].graph.get()].push_back(ve); + } + } + + if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) { + DEBUG_PRINTF("too many graphs, stopping\n"); + return; + } + + for (NGHolder *g : seen_order) { + if (contains(stuck, g)) { + DEBUG_PRINTF("already known to be bad\n"); + continue; + } + bool rv = extractStrongLiteral(*g, vg, edges_by_graph[g], cc); + if (rv) { + changed = true; + } else { + stuck.insert(g); + } + } + } while (changed); +} + +#define INFIX_STRONG_GUARD_LEN 8 +#define INFIX_MIN_SPLIT_LITERAL_LEN 12 + +static +bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector &ee, + const CompileContext &cc) { + unique_ptr split = findBestNormalSplit(h, vg, ee, cc); + + if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN + && splitRoseEdge(h, vg, ee, *split)) { + DEBUG_PRINTF("splitting simple literal\n"); + return true; + } + + DEBUG_PRINTF("trying for a netflow cut\n"); + /* look for netflow cuts which don't produce good prefixes */ + bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8); + + DEBUG_PRINTF("did netflow cut? = %d\n", (int)rv); + + return rv; +} + +/** + * Infixes which are weakly guarded can, in effect, act like prefixes as they + * will often be live. We should try to split these infixes further if they + * contain strong literals so that we are at least running smaller weak infixes + * which can hopefully be accelerated/miracled. 
+ */ +static +void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetAvoidWeakInfixes) { + return; + } + STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n"); + + RoseInVertex start = getStart(vg); + + set weak; + vector ordered_weak; + + for (RoseInVertex vv : adjacent_vertices_range(start, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[vv].type == RIV_LITERAL); + if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) { + continue; + } + + for (const RoseInEdge &e : out_edges_range(vv, vg)) { + if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) { + continue; + } + + NGHolder *h = vg[e].graph.get(); + DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h); + if (!contains(weak, h)) { + weak.insert(h); + ordered_weak.push_back(h); + } + } + } + + map > weak_edges; + for (const RoseInEdge &ve : edges_range(vg)) { + if (contains(weak, vg[ve].graph.get())) { + weak_edges[vg[ve].graph.get()].push_back(ve); + } + } + + for (NGHolder *h : ordered_weak) { + improveInfix(*h, vg, weak_edges[h], cc); + } +} + +static +void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, + const vector &ee, const VertLitInfo &split, + bool eod, const flat_set &reports) { + const vector &splitters = split.vv; + assert(!splitters.empty()); + + shared_ptr lhs = make_shared(); + unordered_map v_map; + cloneHolder(*lhs, base_graph, &v_map); + lhs->kind = NFA_INFIX; + clear_in_edges(lhs->accept, *lhs); + clear_in_edges(lhs->acceptEod, *lhs); + add_edge(lhs->accept, lhs->acceptEod, *lhs); + clearReports(*lhs); + for (NFAVertex v : splitters) { + add_edge(v_map[v], lhs->accept, *lhs); + (*lhs)[v_map[v]].reports.insert(0); + } + pruneUseless(*lhs); + + /* create literal vertices and connect preds */ + for (const auto &lit : split.lit) { + if (!can_match(*lhs, lit, is_triggered(*lhs))) { + continue; + } + + DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); + RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + + RoseInVertex tt; + if (eod) { + DEBUG_PRINTF("doing eod\n"); + tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg); + } else { + DEBUG_PRINTF("doing non-eod\n"); + tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg); + } + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + + for (const RoseInEdge &e : ee) { + RoseInVertex u = source(e, vg); + assert(!edge(u, v, vg).second); + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + } + } +} + +#define MIN_SUFFIX_LEN 6 + +static +bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, + const vector &suffix_edges, + const CompileContext &cc) { + DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h, + suffix_edges.size()); + /* + * We would, in general, rather not have output exposed engines because + * once they are triggered, they must be run while infixes only have to run + * if the successor literal is seen. Matches from output exposed engines + * also have to be placed in a priority queue and interleaved with matches + * from other sources. + * + * Note: + * - if the LHS is extremely unlikely we may be better off leaving + * a suffix unguarded. + * + * - limited width suffixes may be less bad as they won't be continuously + * active, we may want to have (a) stronger controls on if we want to pick + * a trailing literal in these cases and/or (b) look also for literals + * near accept as well as right on accept + * + * TODO: improve heuristics, splitting logic. 
+ */ + + /* we may do multiple splits corresponding to different report behaviour */ + set seen; + map >, VertLitInfo> by_reports; /* eod, rep */ + + for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) { + set ss = getLiteralSet(h, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)]; + insert(&vli.lit, ss); + vli.vv.push_back(v); + seen.insert(v); + } + + seen.insert(h.accept); + for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) { + if (contains(seen, v)) { + continue; + } + + set ss = getLiteralSet(h, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)]; + insert(&vli.lit, ss); + vli.vv.push_back(v); + } + + assert(!by_reports.empty()); + + /* TODO: how strong a min len do we want here ? */ + u32 min_len = cc.grey.minRoseLiteralLength; + ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN); + + for (auto &vli : by_reports | map_values) { + u64a score = sanitizeAndCompressAndScore(vli.lit); + + if (vli.lit.empty() + || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len, + false)) { + return false; + } + } + + for (const auto &info : by_reports) { + DEBUG_PRINTF("splitting on simple literals\n"); + splitEdgesForSuffix(h, vg, suffix_edges, info.second, + info.first.first /* eod */, + info.first.second /* reports */); + } + + for (const RoseInEdge &e : suffix_edges) { + remove_edge(e, vg); + } + return true; +} + +static +void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetAvoidSuffixes) { + return; + } + + STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n"); + + RoseInVertex accept = getPrimaryAccept(vg); + map > suffixes; + vector ordered_suffixes; + + /* find suffixes */ + for (const RoseInEdge &e : in_edges_range(accept, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[source(e, vg)].type == RIV_LITERAL); + assert(vg[e].graph); /* non suffix paths should be wired to other + accepts */ + const NGHolder *h = vg[e].graph.get(); + if (!contains(suffixes, h)) { + ordered_suffixes.push_back(h); + } + suffixes[h].push_back(e); + } + + /* look at suffixes and try to split */ + for (const NGHolder *h : ordered_suffixes) { + replaceSuffixWithInfix(*h, vg, suffixes[h], cc); + } +} + +static +bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) { + if (out_degree(h.start, h) != 3) { + return false; + } + + NFAVertex v = NGHolder::null_vertex(); + NFAVertex ds = NGHolder::null_vertex(); + + for (NFAVertex a : adjacent_vertices_range(h.start, h)) { + if (a == h.startDs) { + continue; + } + if (h[a].char_reach.all()) { + ds = a; + if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) { + return false; + } + } else { + v = a; + } + } + + if (!v || !ds || !edge(ds, v, h).second) { + return false; + } + + if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { + return false; + } + + ue2_literal lit; + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + while (out_degree(v, h) == 1) { + NFAVertex vv = *adjacent_vertices(v, h).first; + if (h[vv].char_reach.count() != 1 + && !h[vv].char_reach.isCaselessChar()) { + break; + } + + v = vv; + + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + } + + if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) { + /* we have rediscovered the post-infix literal */ + return false; + } + + if 
(bad_mixed_sensitivity(lit)) { + make_nocase(&lit); + } + + DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + out->vv = {v}; + out->lit = {lit}; + return true; +} + +static +bool lookForDoubleCut(const NGHolder &h, const vector &ee, + RoseInGraph &vg, const Grey &grey) { + VertLitInfo info; + if (!leadingDotStartLiteral(h, &info) + || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { + return false; + } + DEBUG_PRINTF("performing split\n"); + return splitRoseEdge(h, vg, ee, {info}); +} + +static +void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetDoubleCut) { + return; + } + + map > right_edges; + vector ordered_graphs; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { + const NGHolder *h = vg[ve].graph.get(); + if (!contains(right_edges, h)) { + ordered_graphs.push_back(h); + } + right_edges[h].push_back(ve); + } + } + + for (const NGHolder *h : ordered_graphs) { + lookForDoubleCut(*h, right_edges[h], vg, cc.grey); + } +} + +static +pair findLiteralBefore(const NGHolder &h, NFAVertex v) { + ue2_literal lit; + if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { + return {v, std::move(lit) }; + } + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + + while (in_degree(v, h) == 1) { + NFAVertex vv = *inv_adjacent_vertices(v, h).first; + if (h[vv].char_reach.count() != 1 + && !h[vv].char_reach.isCaselessChar()) { + break; + } + + lit.push_back(h[vv].char_reach.find_first(), + h[vv].char_reach.isCaselessChar()); + v = vv; + } + + return {v, std::move(lit) }; +} + +static +bool lookForDotStarPred(NFAVertex v, const NGHolder &h, + NFAVertex *u, NFAVertex *ds) { + *u = NGHolder::null_vertex(); + *ds = NGHolder::null_vertex(); + for (NFAVertex a : inv_adjacent_vertices_range(v, h)) { + if (h[a].char_reach.all()) { + if (!edge(a, a, h).second) { + return false; + } + + if (*ds) { + return false; + } + + *ds = a; + } else { + if (*u) { + return false; + } + *u = a; + } + } + + if (!*u || !*ds) { + return false; + } + + return true; +} + +static +bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) { + /* Note: there is no delay yet - so the final literal is the already + * discovered successor literal - we are in fact interested in the literal + * before it. 
*/ + + if (in_degree(h.accept, h) != 1) { + return false; + } + + if (in_degree(h.acceptEod, h) != 1) { + assert(0); + return false; + } + + NFAVertex v + = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first; + + NFAVertex u; + NFAVertex ds; + + if (!lookForDotStarPred(v, h, &u, &ds)) { + return false; + } + + v = u; + auto rv = findLiteralBefore(h, v); + + if (!lookForDotStarPred(v, h, &u, &ds)) { + return false; + } + + ue2_literal lit = reverse_literal(rv.second); + DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + + if (bad_mixed_sensitivity(lit)) { + make_nocase(&lit); + } + + out->vv = {v}; + out->lit = {lit}; + return true; +} + +static +bool lookForTrailingLiteralDotStar(const NGHolder &h, + const vector &ee, + RoseInGraph &vg, const Grey &grey) { + VertLitInfo info; + if (!trailingDotStarLiteral(h, &info) + || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { + return false; + } + DEBUG_PRINTF("performing split\n"); + return splitRoseEdge(h, vg, ee, info); +} + +/* In streaming mode, active engines have to be caught up at stream boundaries + * and have to be stored in stream state, so we prefer to decompose patterns + * in to literals with no state between them if possible. */ +static +void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetLiteralChains) { + return; + } + + bool changed; + do { + changed = false; + + map > right_edges; + vector ordered_graphs; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { + const NGHolder *h = vg[ve].graph.get(); + if (!contains(right_edges, h)) { + ordered_graphs.push_back(h); + } + right_edges[h].push_back(ve); + } + } + + for (const NGHolder *h : ordered_graphs) { + const vector &ee = right_edges[h]; + bool rv = lookForDoubleCut(*h, ee, vg, cc.grey); + if (!rv && h->kind != NFA_SUFFIX) { + rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey); + } + changed |= rv; + } + } while (changed); +} + +static +bool lookForCleanSplit(const NGHolder &h, const vector &ee, + RoseInGraph &vg, const CompileContext &cc) { + unique_ptr split = findBestCleanSplit(h, cc); + + if (split) { + return splitRoseEdge(h, vg, {ee}, *split); + } + + return false; +} + +#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4 + +static +void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { + u32 gen = 0; + + vector prev = {getStart(vg)}; + + while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { + /* collect vertices in edge order for determinism */ + vector curr; + set curr_seen; + for (RoseInVertex u : prev) { + for (auto v : adjacent_vertices_range(u, vg)) { + if (curr_seen.insert(v).second) { + curr.push_back(v); + } + } + } + + map> rightfixes; + vector ordered_graphs; + for (RoseInVertex v : curr) { + for (const RoseInEdge &e : out_edges_range(v, vg)) { + if (vg[e].graph) { + NGHolder *h = vg[e].graph.get(); + if (!contains(rightfixes, h)) { + ordered_graphs.push_back(h); + } + rightfixes[h].push_back(e); + } + } + } + + for (const NGHolder *h : ordered_graphs) { + lookForCleanSplit(*h, rightfixes[h], vg, cc); + } + + prev = curr; + gen++; + } +} + +static +void rehomeEodSuffixes(RoseInGraph &vg) { + // Find edges to accept with EOD-anchored graphs that we can move over to + // acceptEod. 
+ vector acc_edges; + for (const auto &e : edges_range(vg)) { + if (vg[target(e, vg)].type != RIV_ACCEPT) { + continue; + } + if (vg[e].haig || !vg[e].graph) { + continue; + } + + const NGHolder &h = *vg[e].graph; + + if (in_degree(h.accept, h)) { + DEBUG_PRINTF("graph isn't eod anchored\n"); + continue; + } + + acc_edges.push_back(e); + } + + for (const RoseInEdge &e : acc_edges) { + // Move this edge from accept to acceptEod + RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg); + add_edge(source(e, vg), w, vg[e], vg); + remove_edge(e, vg); + } + + /* old accept vertices will be tidied up by final pruneUseless() call */ +} + +bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + const CompileContext &cc) { + assert(!can_never_match(h)); + + if (!cc.grey.allowViolet) { + return false; + } + + DEBUG_PRINTF("hello world\n"); + + RoseInGraph vg = populateTrivialGraph(h); + + /* Step 1: avoid outfixes as we always have to run them. */ + avoidOutfixes(vg, cc); + + if (num_vertices(vg) <= 2) { + /* only have an outfix; leave for ng_rose for now */ + return false; + } + + removeRedundantPrefixes(vg); + dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot"); + + /* Step 2: avoid non-transient prefixes (esp in streaming mode) */ + findBetterPrefixes(vg, cc); + + dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot"); + + extractStrongLiterals(vg, cc); + dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot"); + improveWeakInfixes(vg, cc); + dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot"); + + /* Step 3: avoid output exposed engines if there is a strong trailing + literal) */ + avoidSuffixes(vg, cc); + + /* Step 4: look for infixes/suffixes with leading .*literals + * This can reduce the amount of work a heavily picked literal has to do and + * reduce the amount of state used as .* is handled internally to rose. */ + lookForDoubleCut(vg, cc); + + if (cc.streaming) { + lookForCleanEarlySplits(vg, cc); + decomposeLiteralChains(vg, cc); + } + + /* Step 5: avoid unimplementable, or overly large engines if possible */ + /* TODO: later - ng_rose is currently acting as a backstop */ + + /* Step 6: send to rose */ + rehomeEodSuffixes(vg); + removeRedundantLiterals(vg, cc); + + pruneUseless(vg); + dumpPreRoseGraph(vg, cc.grey); + calcVertexOffsets(vg); + bool rv = rose.addRose(vg, prefilter); + DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail"); + return rv; +} + +} diff --git a/src/nfagraph/ng_violet.h b/src/nfagraph/ng_violet.h new file mode 100644 index 00000000..fb62bfc0 --- /dev/null +++ b/src/nfagraph/ng_violet.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Violet method of rose construction from NGHolder. + */ + +#ifndef NG_VIOLET_H +#define NG_VIOLET_H + +#include "ue2common.h" + +namespace ue2 { + +class NGHolder; +class RoseBuild; + +struct CompileContext; + +/** \brief Attempt to consume the entire pattern in graph \a h with Rose. + * Returns true if successful. */ +bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + const CompileContext &cc); + +} // namespace ue2 + +#endif diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 65cd7c1a..53130ddf 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1226,9 +1226,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { '\\Q' => { fgoto readQuotedLiteral; }; - '\\E' => { - throw LocatedParseError("Unmatched \\E"); - }; + # An \E that is not preceded by a \Q is ignored + '\\E' => { /* noop */ }; # Match any character '\.' => { currentSeq->addComponent(generateComponent(CLASS_ANY, false, mode)); @@ -1447,12 +1446,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) { // Otherwise, we interpret the first three digits as an // octal escape, and the remaining characters stand for // themselves as literals. - const u8 *p = ts; + const u8 *s = ts; unsigned int accum = 0; unsigned int oct_digits = 0; - assert(*p == '\\'); // token starts at backslash - for (++p; p < te && oct_digits < 3; ++oct_digits, ++p) { - u8 digit = *p - '0'; + assert(*s == '\\'); // token starts at backslash + for (++s; s < te && oct_digits < 3; ++oct_digits, ++s) { + u8 digit = *s - '0'; if (digit < 8) { accum = digit + accum * 8; } else { @@ -1465,8 +1464,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { } // And then the rest of the digits, if any, are literal. 
- for (; p < te; ++p) { - addLiteral(currentSeq, *p, mode); + for (; s < te; ++s) { + addLiteral(currentSeq, *s, mode); } } }; diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index f6f5d383..3f58d752 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,7 +162,7 @@ ConstructLiteralVisitor::~ConstructLiteralVisitor() {} bool shortcutLiteral(NG &ng, const ParsedExpression &expr) { assert(expr.component); - if (!ng.cc.grey.allowRose) { + if (!ng.cc.grey.allowLiteral) { return false; } diff --git a/src/report.h b/src/report.h index d037d11b..4a5f401e 100644 --- a/src/report.h +++ b/src/report.h @@ -115,6 +115,42 @@ enum DedupeResult dedupeCatchup(const struct RoseEngine *rose, return DEDUPE_CONTINUE; } +/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector + * \a evec. */ +static really_inline +int isExhausted(const struct RoseEngine *rose, const char *evec, u32 ekey) { + DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + return mmbit_isset((const u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Returns 1 if all exhaustion keys in the bitvector are on. */ +static really_inline +int isAllExhausted(const struct RoseEngine *rose, const char *evec) { + if (!rose->canExhaust) { + return 0; /* pattern set is inexhaustible */ + } + + return mmbit_all((const u8 *)evec, rose->ekeyCount); +} + +/** \brief Mark key \a ekey on in the exhaustion vector. */ +static really_inline +void markAsMatched(const struct RoseEngine *rose, char *evec, u32 ekey) { + DEBUG_PRINTF("marking as exhausted key %u\n", ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + mmbit_set((u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Clear all keys in the exhaustion vector. */ +static really_inline +void clearEvec(const struct RoseEngine *rose, char *evec) { + DEBUG_PRINTF("clearing evec %p %u\n", evec, rose->ekeyCount); + mmbit_clear((u8 *)evec, rose->ekeyCount); +} + /** * \brief Deliver the given report to the user callback. * diff --git a/src/rose/block.c b/src/rose/block.c index 5fc5c8a1..fc72c6e9 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -29,13 +29,14 @@ #include "catchup.h" #include "init.h" #include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_common.h" #include "nfa/nfa_api.h" #include "nfa/nfa_internal.h" #include "nfa/nfa_rev_api.h" #include "nfa/mcclellan.h" #include "util/fatbit.h" -#include "rose.h" -#include "rose_common.h" static rose_inline void runAnchoredTableBlock(const struct RoseEngine *t, const void *atable, @@ -157,13 +158,213 @@ void init_for_block(const struct RoseEngine *t, struct hs_scratch *scratch, init_outfixes_for_block(t, scratch, state, is_small_block); } -void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { +static rose_inline +void roseBlockEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(t->requiresEodCheck); + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || offset <= t->maxBiAnchoredWidth); + + assert(!can_stop_matching(scratch)); + assert(t->eodProgramOffset); + + // Ensure that history is correct before we look for EOD matches. 
+ roseFlushLastByteHistory(t, scratch, offset); + scratch->tctxt.lastEndOffset = offset; + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const size_t match_len = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, + flags); +} + +/** + * \brief Run the anchored matcher, if any. Returns non-zero if matching should + * halt. + */ +static rose_inline +int roseBlockAnchored(const struct RoseEngine *t, struct hs_scratch *scratch) { + const void *atable = getALiteralMatcher(t); + if (!atable) { + DEBUG_PRINTF("no anchored table\n"); + return 0; + } + + const size_t length = scratch->core_info.len; + + if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->amatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->amatcherMinWidth) { + return 0; + } + + runAnchoredTableBlock(t, atable, scratch); + + return can_stop_matching(scratch); +} + +/** + * \brief Run the floating matcher, if any. Returns non-zero if matching should + * halt. + */ +static rose_inline +int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { + const struct HWLM *ftable = getFLiteralMatcher(t); + if (!ftable) { + return 0; + } + + const size_t length = scratch->core_info.len; + char *state = scratch->core_info.state; + struct RoseContext *tctxt = &scratch->tctxt; + + DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, + t->floatingMinDistance); + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + return 0; + } + + if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->fmatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->fmatcherMinWidth) { + return 0; + } + + const u8 *buffer = scratch->core_info.buf; + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = MIN(t->floatingDistance, length); + } + if (flen <= t->floatingMinDistance) { + return 0; + } + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseFloatingCallback, + scratch, tctxt->groups & t->floating_group_mask); + + return can_stop_matching(scratch); +} + +static rose_inline +void runEagerPrefixesBlock(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u/%u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = 
scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + + if (scratch->core_info.len < nfa->minWidth) { + /* we know that there is not enough data for this to ever match, so + * we can immediately squash. */ + mmbit_unset(ara, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + } + + s64a loc = MIN(scratch->core_info.len, EAGER_STOP_OFFSET); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + if (loc == (s64a)scratch->core_info.len) { + /* We know that the prefix does not match in the block so we + * can squash the groups anyway even though it did not die */ + /* TODO: if we knew the minimum lag the leftfix is checked at we + * could make this check tighter */ + DEBUG_PRINTF("queue %u has no match in block, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { assert(t); assert(scratch); assert(scratch->core_info.buf); assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) < MAX_SPARSE_ITER_STATES); + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + // If this block is shorter than our minimum width, then no pattern in this + // RoseEngine could match. + /* minWidth checks should have already been performed by the caller */ + assert(scratch->core_info.len >= t->minWidth); + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). 
+ /* This check is now handled by the interpreter */ + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || scratch->core_info.len <= t->maxBiAnchoredWidth); + const size_t length = scratch->core_info.len; // We have optimizations for small block scans: we run a single coalesced @@ -189,65 +390,17 @@ void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { DEBUG_PRINTF("-- %016llx\n", tctxt->groups); hwlmExec(sbtable, scratch->core_info.buf, sblen, 0, roseCallback, scratch, tctxt->groups); - goto exit; + } else { + runEagerPrefixesBlock(t, scratch); + + if (roseBlockAnchored(t, scratch)) { + return; + } + if (roseBlockFloating(t, scratch)) { + return; + } } - const void *atable = getALiteralMatcher(t); - - if (atable) { - if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF - && length > t->amatcherMaxBiAnchoredWidth) { - goto skip_atable; - } - - if (length < t->amatcherMinWidth) { - goto skip_atable; - } - - - runAnchoredTableBlock(t, atable, scratch); - - if (can_stop_matching(scratch)) { - goto exit; - } - - skip_atable:; - } - - const struct HWLM *ftable = getFLiteralMatcher(t); - if (ftable) { - DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, - t->floatingMinDistance); - if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { - DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); - goto exit; - } - - if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF - && length > t->fmatcherMaxBiAnchoredWidth) { - goto exit; - } - - if (length < t->fmatcherMinWidth) { - goto exit; - } - - const u8 *buffer = scratch->core_info.buf; - size_t flen = length; - if (t->floatingDistance != ROSE_BOUND_INF) { - flen = MIN(t->floatingDistance, length); - } - if (flen <= t->floatingMinDistance) { - goto exit; - } - - DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); - DEBUG_PRINTF("-- %016llx\n", tctxt->groups); - hwlmExec(ftable, buffer, flen, t->floatingMinDistance, - roseCallback, scratch, tctxt->groups); - } - -exit:; if (cleanUpDelayed(t, scratch, length, 0) == HWLM_TERMINATE_MATCHING) { return; } @@ -255,4 +408,16 @@ exit:; assert(!can_stop_matching(scratch)); roseCatchUpTo(t, scratch, length); + + if (!t->requiresEodCheck || !t->eodProgramOffset) { + DEBUG_PRINTF("no eod check required\n"); + return; + } + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing, already halted\n"); + return; + } + + roseBlockEodExec(t, length, scratch); } diff --git a/src/rose/catchup.c b/src/rose/catchup.c index dba9629e..017a6bf0 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -39,6 +39,7 @@ #include "nfa/mpv.h" #include "som/som_runtime.h" #include "util/fatbit.h" +#include "report.h" typedef struct queue_match PQ_T; #define PQ_COMP(pqc_items, a, b) ((pqc_items)[a].loc < (pqc_items)[b].loc) @@ -51,14 +52,49 @@ int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, u64a som, u64a offset, ReportID id, const char from_mpv) { const u32 program = id; const size_t match_len = 0; // Unused in this path. - const char in_anchored = 0; - const char in_catchup = 1; - roseRunProgram(rose, scratch, program, som, offset, match_len, in_anchored, - in_catchup, from_mpv, 0); + u8 flags = ROSE_PROG_FLAG_IN_CATCHUP; + if (from_mpv) { + flags |= ROSE_PROG_FLAG_FROM_MPV; + } + + roseRunProgram(rose, scratch, program, som, offset, match_len, flags); return can_stop_matching(scratch) ? 
MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } +static rose_inline +char roseSuffixInfoIsExhausted(const struct RoseEngine *rose, + const struct NfaInfo *info, + const char *exhausted) { + if (!info->ekeyListOffset) { + return 0; + } + + DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); + + /* INVALID_EKEY terminated list */ + const u32 *ekeys = getByOffset(rose, info->ekeyListOffset); + while (*ekeys != INVALID_EKEY) { + DEBUG_PRINTF("check %u\n", *ekeys); + if (!isExhausted(rose, exhausted, *ekeys)) { + DEBUG_PRINTF("not exhausted -> alive\n"); + return 0; + } + ++ekeys; + } + + DEBUG_PRINTF("all ekeys exhausted -> dead\n"); + return 1; +} + +static really_inline +char roseSuffixIsExhausted(const struct RoseEngine *rose, u32 qi, + const char *exhausted) { + DEBUG_PRINTF("check queue %u\n", qi); + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + return roseSuffixInfoIsExhausted(rose, info, exhausted); +} + static really_inline void deactivateQueue(const struct RoseEngine *t, u8 *aa, u32 qi, struct hs_scratch *scratch) { @@ -245,14 +281,14 @@ restart: /* for use by mpv (chained) only */ static -int roseNfaFinalBlastAdaptor(u64a offset, ReportID id, void *context) { +int roseNfaFinalBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); const struct RoseEngine *t = scratch->core_info.rose; - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, 1); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, 1); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { @@ -358,7 +394,6 @@ hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, assert(!q->report_current); q->cb = roseNfaFinalBlastAdaptor; - q->som_cb = NULL; DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, q->items[q->cur].location, loc); @@ -413,113 +448,47 @@ char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { } static -int roseNfaBlastAdaptor(u64a offset, ReportID id, void *context) { +int roseNfaBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; + assert(scratch && scratch->magic == SCRATCH_MAGIC); const struct RoseEngine *t = scratch->core_info.rose; - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); const char from_mpv = in_mpv(t, scratch); - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, from_mpv); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, from_mpv); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { return MO_CONTINUE_MATCHING; } else { assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, + return !roseSuffixIsExhausted(t, scratch->tctxt.curr_qi, scratch->core_info.exhaustionVector); } } -static -int roseNfaBlastAdaptorNoInternal(u64a offset, ReportID id, void *context) { +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; - const struct RoseEngine *t = scratch->core_info.rose; + 
assert(scratch && scratch->magic == SCRATCH_MAGIC); - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); - - assert(!in_mpv(t, scratch)); - - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, 0); - if (cb_rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return MO_CONTINUE_MATCHING; - } else { - assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, - scratch->core_info.exhaustionVector); - } -} - -static -int roseNfaBlastSomAdaptor(u64a from_offset, u64a offset, ReportID id, - void *context) { - struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; - const struct RoseEngine *t = scratch->core_info.rose; - - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); - - assert(!in_mpv(t, scratch)); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); /* must be a external report as haig cannot directly participate in chain */ - int cb_rv = roseNfaRunProgram(scratch->core_info.rose, scratch, from_offset, - offset, id, 0); - if (cb_rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return MO_CONTINUE_MATCHING; - } else { - assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, - scratch->core_info.exhaustionVector); - } -} - -int roseNfaAdaptor(u64a offset, ReportID id, void *context) { - struct hs_scratch *scratch = context; - DEBUG_PRINTF("masky got himself a match @%llu id %u !woot!\n", offset, id); - - return roseNfaRunProgram(scratch->core_info.rose, scratch, 0, offset, id, + return roseNfaRunProgram(scratch->core_info.rose, scratch, start, end, id, 0); } -int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, - void *context) { - struct hs_scratch *scratch = context; - DEBUG_PRINTF("masky got himself a match @%llu id %u !woot!\n", offset, id); - - /* must be a external report as haig cannot directly participate in chain */ - return roseNfaRunProgram(scratch->core_info.rose, scratch, from_offset, - offset, id, 0); -} - static really_inline -char blast_queue(const struct RoseEngine *t, struct hs_scratch *scratch, - struct mq *q, u32 qi, s64a to_loc, char report_current) { - struct RoseContext *tctxt = &scratch->tctxt; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - - tctxt->curr_qi = qi; - if (info->only_external) { - q->cb = roseNfaBlastAdaptorNoInternal; - } else { - q->cb = roseNfaBlastAdaptor; - } +char blast_queue(struct hs_scratch *scratch, struct mq *q, u32 qi, s64a to_loc, + char report_current) { + scratch->tctxt.curr_qi = qi; + q->cb = roseNfaBlastAdaptor; q->report_current = report_current; - q->som_cb = roseNfaBlastSomAdaptor; DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, q_cur_loc(q), to_loc); char alive = nfaQueueExec(q->nfa, q, to_loc); q->cb = roseNfaAdaptor; - q->som_cb = roseNfaSomAdaptor; assert(!q->report_current); return alive; @@ -549,7 +518,7 @@ hwlmcb_rv_t buildSufPQ_final(const struct RoseEngine *t, s64a report_ok_loc, ensureEnd(q, a_qi, final_loc); - char alive = blast_queue(t, scratch, q, a_qi, second_place_loc, 0); + char alive = blast_queue(scratch, q, a_qi, second_place_loc, 0); /* We have three possible outcomes: * (1) the nfa died @@ -754,7 +723,7 @@ hwlmcb_rv_t buildSufPQ(const struct RoseEngine *t, char *state, s64a safe_loc, = scratch->catchup_pq.qm_size ? 
pq_top_loc(&scratch->catchup_pq) : safe_loc; second_place_loc = MIN(second_place_loc, safe_loc); - if (n_qi == MMB_INVALID && report_ok_loc < second_place_loc) { + if (n_qi == MMB_INVALID && report_ok_loc <= second_place_loc) { if (buildSufPQ_final(t, report_ok_loc, second_place_loc, final_loc, scratch, aa, a_qi) == HWLM_TERMINATE_MATCHING) { @@ -845,7 +814,7 @@ hwlmcb_rv_t roseCatchUpNfas(const struct RoseEngine *t, s64a loc, continue; } - char alive = blast_queue(t, scratch, q, qi, second_place_loc, 1); + char alive = blast_queue(scratch, q, qi, second_place_loc, 1); if (!alive) { if (can_stop_matching(scratch)) { diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index d36ed272..76db5a77 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ #include "rose_internal.h" #include "nfa/nfa_api_queue.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /** \brief Maximum number of bytes to scan when looking for a "counting miracle" * stop character. */ @@ -83,7 +82,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, } #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, diff --git a/src/rose/eod.c b/src/rose/eod.c deleted file mode 100644 index 7e8d4b3d..00000000 --- a/src/rose/eod.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "catchup.h" -#include "match.h" -#include "program_runtime.h" -#include "rose.h" -#include "util/fatbit.h" - -static really_inline -void initContext(const struct RoseEngine *t, char *state, u64a offset, - struct hs_scratch *scratch) { - struct RoseContext *tctxt = &scratch->tctxt; - tctxt->groups = loadGroups(t, state); /* TODO: diff groups for eod */ - tctxt->lit_offset_adjust = scratch->core_info.buf_offset - - scratch->core_info.hlen - + 1; // index after last byte - tctxt->delayLastEndOffset = offset; - tctxt->lastEndOffset = offset; - tctxt->filledDelayedSlots = 0; - tctxt->lastMatchOffset = 0; - tctxt->minMatchOffset = offset; - tctxt->minNonMpvMatchOffset = offset; - tctxt->next_mpv_offset = offset; - - scratch->catchup_pq.qm_size = 0; - scratch->al_log_sum = 0; /* clear the anchored logs */ - - fatbit_clear(scratch->aqa); -} - -static rose_inline -hwlmcb_rv_t roseEodRunMatcher(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch, - const char is_streaming) { - assert(t->ematcherOffset); - - size_t eod_len; - const u8 *eod_data; - if (!is_streaming) { /* Block */ - eod_data = scratch->core_info.buf; - eod_len = scratch->core_info.len; - } else { /* Streaming */ - eod_len = scratch->core_info.hlen; - eod_data = scratch->core_info.hbuf; - } - - assert(eod_data); - assert(eod_len); - - // If we don't have enough bytes to produce a match from an EOD table scan, - // there's no point scanning. - if (eod_len < t->eodmatcherMinWidth) { - DEBUG_PRINTF("len=%zu < eodmatcherMinWidth=%u\n", eod_len, - t->eodmatcherMinWidth); - return HWLM_CONTINUE_MATCHING; - } - - // Ensure that we only need scan the last N bytes, where N is the length of - // the eod-anchored matcher region. - size_t adj = eod_len - MIN(eod_len, t->ematcherRegionSize); - - DEBUG_PRINTF("eod offset=%llu, eod length=%zu\n", offset, eod_len); - - struct RoseContext *tctxt = &scratch->tctxt; - const struct HWLM *etable = getELiteralMatcher(t); - - hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, - tctxt->groups); - - // We may need to fire delayed matches - return cleanUpDelayed(t, scratch, 0, offset); -} - -static rose_inline -int roseEodRunIterator(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - if (!t->eodIterProgramOffset) { - return MO_CONTINUE_MATCHING; - } - - DEBUG_PRINTF("running eod program at offset %u\n", t->eodIterProgramOffset); - - const u64a som = 0; - const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; - if (roseRunProgram(t, scratch, t->eodIterProgramOffset, som, offset, - match_len, in_anchored, in_catchup, - from_mpv, skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { - return MO_HALT_MATCHING; - } - - return MO_CONTINUE_MATCHING; -} - -/** - * \brief Check for (and deliver) reports from active output-exposed (suffix - * or outfix) NFAs. - * - * \return MO_HALT_MATCHING if the user instructs us to stop. - */ -static rose_inline -int roseCheckNfaEod(const struct RoseEngine *t, char *state, - struct hs_scratch *scratch, u64a offset, - const char is_streaming) { - if (!t->eodNfaIterOffset) { - DEBUG_PRINTF("no engines that report at EOD\n"); - return MO_CONTINUE_MATCHING; - } - - /* data, len is used for state decompress, should be full available data */ - u8 key = 0; - if (is_streaming) { - const u8 *eod_data = scratch->core_info.hbuf; - size_t eod_len = scratch->core_info.hlen; - key = eod_len ? 
eod_data[eod_len - 1] : 0; - } - - const u8 *aa = getActiveLeafArray(t, state); - const u32 aaCount = t->activeArrayCount; - - const struct mmbit_sparse_iter *it = getByOffset(t, t->eodNfaIterOffset); - assert(ISALIGNED(it)); - - u32 idx = 0; - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); - qi != MMB_INVALID; - qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - DEBUG_PRINTF("checking nfa %u\n", qi); - assert(nfaAcceptsEod(nfa)); - - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = (const char *)state + info->stateOffset; - - if (is_streaming) { - // Decompress stream state. - nfaExpandState(nfa, fstate, sstate, offset, key); - } - - if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, - roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return MO_HALT_MATCHING; - } - } - - return MO_CONTINUE_MATCHING; -} - -static rose_inline -void cleanupAfterEodMatcher(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - // Flush history to make sure it's consistent. - roseFlushLastByteHistory(t, scratch, offset); -} - -static rose_inline -void roseCheckEodSuffixes(const struct RoseEngine *t, char *state, u64a offset, - struct hs_scratch *scratch) { - const u8 *aa = getActiveLeafArray(t, state); - const u32 aaCount = t->activeArrayCount; - UNUSED u32 qCount = t->queueCount; - - for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; - qi = mmbit_iterate(aa, aaCount, qi)) { - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - assert(nfaAcceptsEod(nfa)); - - DEBUG_PRINTF("checking nfa %u\n", qi); - - assert(fatbit_isset(scratch->aqa, qCount, qi)); /* we have just been - triggered */ - - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = (const char *)state + info->stateOffset; - - struct mq *q = scratch->queues + qi; - - pushQueueNoMerge(q, MQE_END, scratch->core_info.len); - - q->context = NULL; - /* rose exec is used as we don't want to / can't raise matches in the - * history buffer. */ - char rv = nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - if (rv) { /* nfa is still alive */ - if (nfaCheckFinalState(nfa, fstate, sstate, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return; - } - } - } -} - -static rose_inline -int roseRunEodProgram(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - if (!t->eodProgramOffset) { - return MO_CONTINUE_MATCHING; - } - - DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); - - // There should be no pending delayed literals. 
- assert(!scratch->tctxt.filledDelayedSlots); - - const u64a som = 0; - const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; - if (roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - in_anchored, in_catchup, from_mpv, - skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { - return MO_HALT_MATCHING; - } - - return MO_CONTINUE_MATCHING; -} - -static really_inline -void roseEodExec_i(const struct RoseEngine *t, char *state, u64a offset, - struct hs_scratch *scratch, const char is_streaming) { - assert(t); - assert(scratch->core_info.buf || scratch->core_info.hbuf); - assert(!scratch->core_info.buf || !scratch->core_info.hbuf); - assert(!can_stop_matching(scratch)); - - // Run the unconditional EOD program. - if (roseRunEodProgram(t, offset, scratch) == MO_HALT_MATCHING) { - return; - } - - if (roseCheckNfaEod(t, state, scratch, offset, is_streaming) == - MO_HALT_MATCHING) { - return; - } - - if (!t->eodIterProgramOffset && !t->ematcherOffset) { - DEBUG_PRINTF("no eod accepts\n"); - return; - } - - // Handle pending EOD reports. - if (roseEodRunIterator(t, offset, scratch) == MO_HALT_MATCHING) { - return; - } - - // Run the EOD anchored matcher if there is one. - if (t->ematcherOffset) { - assert(t->ematcherRegionSize); - // Unset the reports we just fired so we don't fire them again below. - mmbit_clear(getRoleState(state), t->rolesWithStateCount); - mmbit_clear(getActiveLeafArray(t, state), t->activeArrayCount); - - if (roseEodRunMatcher(t, offset, scratch, is_streaming) == - HWLM_TERMINATE_MATCHING) { - return; - } - - cleanupAfterEodMatcher(t, offset, scratch); - - // Fire any new EOD reports. - if (roseEodRunIterator(t, offset, scratch) == MO_HALT_MATCHING) { - return; - } - - roseCheckEodSuffixes(t, state, offset, scratch); - } -} - -void roseEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - assert(scratch); - assert(t->requiresEodCheck); - DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, - scratch->core_info.len, scratch->core_info.hbuf, - scratch->core_info.hlen); - - // We should not have been called if we've already been told to terminate - // matching. 
- assert(!told_to_stop_matching(scratch)); - - if (t->maxBiAnchoredWidth != ROSE_BOUND_INF - && offset > t->maxBiAnchoredWidth) { - DEBUG_PRINTF("bailing, we are beyond max width\n"); - /* also some of the history/state may be stale */ - return; - } - - char *state = scratch->core_info.state; - assert(state); - - initContext(t, state, offset, scratch); - - roseEodExec_i(t, state, offset, scratch, 1); -} - -static rose_inline -void prepForEod(const struct RoseEngine *t, struct hs_scratch *scratch, - size_t length) { - roseFlushLastByteHistory(t, scratch, length); - scratch->tctxt.lastEndOffset = length; -} - -void roseBlockEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - assert(t->requiresEodCheck); - assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF - || offset <= t->maxBiAnchoredWidth); - - assert(!can_stop_matching(scratch)); - - char *state = scratch->core_info.state; - - // Ensure that history is correct before we look for EOD matches - prepForEod(t, scratch, scratch->core_info.len); - - roseEodExec_i(t, state, offset, scratch, 0); -} diff --git a/src/rose/match.c b/src/rose/match.c index 4e9e72a6..b641e39d 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -27,14 +27,9 @@ */ #include "catchup.h" -#include "counting_miracle.h" -#include "infix.h" #include "match.h" -#include "miracle.h" #include "program_runtime.h" -#include "rose_program.h" #include "rose.h" -#include "som/som_runtime.h" #include "util/bitutils.h" #include "util/fatbit.h" @@ -98,13 +93,9 @@ hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, if (program) { const u64a som = 0; const size_t match_len = end - start + 1; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - UNUSED hwlmcb_rv_t rv = - roseRunProgram(t, scratch, program, som, real_end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + UNUSED hwlmcb_rv_t rv = roseRunProgram(t, scratch, program, som, + real_end, match_len, flags); assert(rv != HWLM_TERMINATE_MATCHING); } @@ -121,28 +112,6 @@ hwlmcb_rv_t ensureMpvQueueFlushed(const struct RoseEngine *t, return ensureQueueFlushed_i(t, scratch, qi, loc, 1, in_chained); } -static rose_inline -void recordAnchoredLiteralMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 literal_id, - u64a end) { - assert(end); - struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); - - DEBUG_PRINTF("record %u @ %llu\n", literal_id, end); - - if (!bf64_set(&scratch->al_log_sum, end - 1)) { - // first time, clear row - DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); - fatbit_clear(anchoredLiteralRows[end - 1]); - } - - u32 rel_idx = literal_id - t->anchored_base_id; - DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx, - t->anchored_count); - assert(rel_idx < t->anchored_count); - fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx); -} - hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, struct hs_scratch *scratch, u32 event, u64a top_squash_distance, u64a end, @@ -220,8 +189,9 @@ event_enqueued: return HWLM_CONTINUE_MATCHING; } -int roseAnchoredCallback(u64a end, u32 id, void *ctx) { +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { struct hs_scratch *scratch = ctx; + assert(scratch && scratch->magic == SCRATCH_MAGIC); struct RoseContext *tctxt = &scratch->tctxt; struct core_info *ci = &scratch->core_info; const struct RoseEngine *t = ci->rose; @@ -250,16 
+220,10 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx) { tctxt->lastEndOffset = real_end; } - const u32 *programs = getByOffset(t, t->litProgramOffset); - assert(id < t->literalCount); - const u64a som = 0; - const char in_anchored = 1; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - if (roseRunProgram(t, scratch, programs[id], som, real_end, match_len, - in_anchored, in_catchup, from_mpv, - skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { + // Note that the "id" we have been handed is the program offset. + const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; + if (roseRunProgram(t, scratch, id, start, real_end, match_len, + flags) == HWLM_TERMINATE_MATCHING) { assert(can_stop_matching(scratch)); DEBUG_PRINTF("caller requested termination\n"); return MO_HALT_MATCHING; @@ -267,15 +231,34 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx) { DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); - if (real_end > t->floatingMinLiteralMatchOffset) { - recordAnchoredLiteralMatch(t, scratch, id, real_end); - } - return MO_CONTINUE_MATCHING; } -// Rose match-processing workhorse -/* assumes not in_anchored */ +/** + * \brief Run the program for the given literal ID, with the interpreter + * inlined into this call. + * + * Assumes not in_anchored. + */ +static really_inline +hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + size_t match_len, u32 id) { + DEBUG_PRINTF("id=%u\n", id); + const u32 *programs = getByOffset(t, t->litProgramOffset); + assert(id < t->literalCount); + const u64a som = 0; + const u8 flags = 0; + return roseRunProgram_i(t, scratch, programs[id], som, end, match_len, + flags); +} + +/** + * \brief Run the program for the given literal ID, with the interpreter + * out of line. + * + * Assumes not in_anchored. + */ static really_inline hwlmcb_rv_t roseProcessMatch(const struct RoseEngine *t, struct hs_scratch *scratch, u64a end, @@ -284,12 +267,8 @@ hwlmcb_rv_t roseProcessMatch(const struct RoseEngine *t, const u32 *programs = getByOffset(t, t->litProgramOffset); assert(id < t->literalCount); const u64a som = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - return roseRunProgram(t, scratch, programs[id], som, end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + return roseRunProgram(t, scratch, programs[id], som, end, match_len, flags); } static rose_inline @@ -516,7 +495,8 @@ anchored_leftovers:; return rv; } -hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { +static really_inline +hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { struct hs_scratch *scratch = ctxt; struct RoseContext *tctx = &scratch->tctxt; const struct RoseEngine *t = scratch->core_info.rose; @@ -551,7 +531,7 @@ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { } size_t match_len = end - start + 1; - rv = roseProcessMatch(t, scratch, real_end, match_len, id); + rv = roseProcessMatchInline(t, scratch, real_end, match_len, id); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctx->groups); @@ -564,30 +544,15 @@ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { return HWLM_TERMINATE_MATCHING; } -/** - * \brief Match callback adaptor used for matches from pure-literal cases. 
- * - * Literal match IDs in this path run limited Rose programs that do not use - * Rose state (which is not initialised in the pure-literal path). They can - * still, for example, check lookarounds or literal masks. - */ -hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, - void *context) { - DEBUG_PRINTF("start=%zu, end=%zu, id=%u\n", start, end, id); - struct hs_scratch *scratch = context; - struct core_info *ci = &scratch->core_info; - const u64a real_end = (u64a)end + ci->buf_offset + 1; - const u64a som = 0; - const size_t match_len = end - start + 1; - const struct RoseEngine *rose = ci->rose; - const u32 *programs = getByOffset(rose, rose->litProgramOffset); - assert(id < rose->literalCount); - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - return roseRunProgram(rose, scratch, programs[id], som, real_end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); +hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { + return roseCallback_i(start, end, id, ctxt); +} + +hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctxt) { + struct hs_scratch *scratch = ctxt; + const struct RoseEngine *t = scratch->core_info.rose; + + return roseCallback_i(start, end, id, ctxt) & t->floating_group_mask; } /** @@ -623,13 +588,9 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, const u64a som = 0; const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, som, stream_offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, program, som, stream_offset, + match_len, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } @@ -637,36 +598,23 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, return MO_CONTINUE_MATCHING; } -static really_inline -int roseReportAdaptor_i(u64a som, u64a offset, ReportID id, void *context) { +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; assert(scratch && scratch->magic == SCRATCH_MAGIC); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + const struct RoseEngine *rose = scratch->core_info.rose; // Our match ID is the program offset. const u32 program = id; const size_t match_len = 0; // Unused in this path. - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, som, offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + roseRunProgram(rose, scratch, program, start, end, match_len, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } return can_stop_matching(scratch) ? 
MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } - -int roseReportAdaptor(u64a offset, ReportID id, void *context) { - DEBUG_PRINTF("offset=%llu, id=%u\n", offset, id); - return roseReportAdaptor_i(0, offset, id, context); -} - -int roseReportSomAdaptor(u64a som, u64a offset, ReportID id, void *context) { - DEBUG_PRINTF("som=%llu, offset=%llu, id=%u\n", som, offset, id); - return roseReportAdaptor_i(som, offset, id, context); -} diff --git a/src/rose/match.h b/src/rose/match.h index cee32fc2..b69ff158 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -29,31 +29,34 @@ #ifndef ROSE_MATCH_H #define ROSE_MATCH_H -#include "hwlm/hwlm.h" +#include "catchup.h" #include "runtime.h" #include "scratch.h" +#include "report.h" #include "rose_common.h" #include "rose_internal.h" #include "ue2common.h" +#include "hwlm/hwlm.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_queue.h" #include "nfa/nfa_api_util.h" #include "som/som_runtime.h" #include "util/bitutils.h" +#include "util/exhaust.h" #include "util/fatbit.h" #include "util/multibit.h" /* Callbacks, defined in catchup.c */ -int roseNfaAdaptor(u64a offset, ReportID id, void *context); -int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, void *context); +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context); /* Callbacks, defined in match.c */ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctx); +hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, void *ctx); -int roseAnchoredCallback(u64a end, u32 id, void *ctx); +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx); /* Common code, used all over Rose runtime */ @@ -78,7 +81,6 @@ void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, q->history = scratch->core_info.hbuf; q->hlength = scratch->core_info.hlen; q->cb = roseNfaAdaptor; - q->som_cb = roseNfaSomAdaptor; q->context = scratch; q->report_current = 0; @@ -294,4 +296,85 @@ int roseHasInFlightMatches(const struct RoseEngine *t, char *state, return 0; } +static rose_inline +hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, + struct hs_scratch *scratch) { + struct core_info *ci = &scratch->core_info; + if (isAllExhausted(t, ci->exhaustionVector)) { + ci->status |= STATUS_EXHAUSTED; + scratch->tctxt.groups = 0; + DEBUG_PRINTF("all exhausted, termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char is_mpv, char in_catchup) { + struct RoseContext *tctxt = &scratch->tctxt; + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + struct fatbit *activeQueues = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + struct mq *q = &scratch->queues[qi]; + DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", + q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); + if (q_cur_loc(q) == loc) { + /* too many tops enqueued at the one spot; need to flatten this queue. + * We can use the full catchups as it will short circuit as we are + * already at this location. 
It also saves waking everybody up */ + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExec(q->nfa, q, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (!in_catchup) { + if (is_mpv) { + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (loc + scratch->core_info.buf_offset + <= tctxt->minNonMpvMatchOffset) { + DEBUG_PRINTF("flushing chained\n"); + if (roseCatchUpMPV(t, loc, scratch) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + goto done_queue_empty; + } + } + + if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } else { + /* we must be a chained nfa */ + assert(is_mpv); + DEBUG_PRINTF("flushing chained\n"); + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } +done_queue_empty: + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } + + assert(!isQueueFull(q)); + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); +} + #endif diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c new file mode 100644 index 00000000..23532d40 --- /dev/null +++ b/src/rose/program_runtime.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. 
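+ * + * This file provides the out-of-line roseRunProgram() entry point; the + * interpreter itself (roseRunProgram_i) is defined in program_runtime.h so + * that hot call sites can use the inlined form directly.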
+ */ + +#include "program_runtime.h" + +int roseNfaEarliestSom(u64a start, UNUSED u64a end, UNUSED ReportID id, + void *context) { + assert(context); + u64a *som = context; + *som = MIN(*som, start); + return MO_CONTINUE_MATCHING; +} + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, + u8 prog_flags) { + return roseRunProgram_i(t, scratch, programOffset, som, end, match_len, + prog_flags); +} diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 78397070..8bf41715 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -26,6 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file + * \brief Rose runtime: program interpreter. + */ + #ifndef PROGRAM_RUNTIME_H #define PROGRAM_RUNTIME_H @@ -39,13 +44,32 @@ #include "rose_internal.h" #include "rose_program.h" #include "rose_types.h" +#include "validate_mask.h" #include "runtime.h" #include "scratch.h" #include "ue2common.h" +#include "hwlm/hwlm.h" // for hwlmcb_rv_t #include "util/compare.h" #include "util/fatbit.h" #include "util/multibit.h" +/* + * Program context flags, which control the behaviour of some instructions at + * based on runtime contexts (whether the program is triggered by the anchored + * matcher, engine catchup, etc). + */ + +#define ROSE_PROG_FLAG_IN_ANCHORED 1 +#define ROSE_PROG_FLAG_IN_CATCHUP 2 +#define ROSE_PROG_FLAG_FROM_MPV 4 +#define ROSE_PROG_FLAG_SKIP_MPV_CATCHUP 8 + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, u8 prog_flags); + +/* Inline implementation follows. */ + static rose_inline int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, const u8 *and_mask, const u8 *exp_mask) { @@ -141,6 +165,33 @@ void rosePushDelayedMatch(const struct RoseEngine *t, fatbit_set(slot, delay_count, delay_index); } +static rose_inline +void recordAnchoredLiteralMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 literal_id, + u64a end) { + assert(end); + + if (end <= t->floatingMinLiteralMatchOffset) { + return; + } + + struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); + + DEBUG_PRINTF("record %u @ %llu\n", literal_id, end); + + if (!bf64_set(&scratch->al_log_sum, end - 1)) { + // first time, clear row + DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); + fatbit_clear(anchoredLiteralRows[end - 1]); + } + + u32 rel_idx = literal_id - t->anchored_base_id; + DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx, + t->anchored_count); + assert(rel_idx < t->anchored_count); + fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx); +} + static rose_inline char roseLeftfixCheckMiracles(const struct RoseEngine *t, const struct LeftNfaInfo *left, @@ -208,87 +259,6 @@ found_miracle: return 1; } -static rose_inline -hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, - struct hs_scratch *scratch) { - struct core_info *ci = &scratch->core_info; - if (isAllExhausted(t, ci->exhaustionVector)) { - ci->status |= STATUS_EXHAUSTED; - scratch->tctxt.groups = 0; - DEBUG_PRINTF("all exhausted, termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - return HWLM_CONTINUE_MATCHING; -} - -static really_inline -hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, s64a loc, - char is_mpv, char in_catchup) { - struct RoseContext *tctxt = &scratch->tctxt; - 
u8 *aa = getActiveLeafArray(t, scratch->core_info.state); - struct fatbit *activeQueues = scratch->aqa; - u32 aaCount = t->activeArrayCount; - u32 qCount = t->queueCount; - - struct mq *q = &scratch->queues[qi]; - DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", - q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); - if (q_cur_loc(q) == loc) { - /* too many tops enqueued at the one spot; need to flatten this queue. - * We can use the full catchups as it will short circuit as we are - * already at this location. It also saves waking everybody up */ - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExec(q->nfa, q, loc); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else if (!in_catchup) { - if (is_mpv) { - tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ - if (loc + scratch->core_info.buf_offset - <= tctxt->minNonMpvMatchOffset) { - DEBUG_PRINTF("flushing chained\n"); - if (roseCatchUpMPV(t, loc, scratch) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - goto done_queue_empty; - } - } - - if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } else { - /* we must be a chained nfa */ - assert(is_mpv); - DEBUG_PRINTF("flushing chained\n"); - tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ - if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } -done_queue_empty: - if (!mmbit_set(aa, aaCount, qi)) { - initQueue(q, qi, t, scratch); - nfaQueueInitState(q->nfa, q); - pushQueueAt(q, 0, MQE_START, loc); - fatbit_set(activeQueues, qCount, qi); - } - - assert(!isQueueFull(q)); - - return roseHaltIfExhausted(t, scratch); -} - -static rose_inline -hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, s64a loc) { - return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); -} - static rose_inline hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, struct hs_scratch *scratch, u32 qi, u32 top, @@ -424,7 +394,7 @@ char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, } s64a loc = (s64a)end - ci->buf_offset - leftfixLag; - assert(loc >= q_cur_loc(q)); + assert(loc >= q_cur_loc(q) || left->eager); assert(leftfixReport != MO_INVALID_IDX); if (!is_infix && left->transient) { @@ -471,7 +441,13 @@ char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, DEBUG_PRINTF("checking for report %u\n", leftfixReport); DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. */ + assert(left->eager); + return 0; } else { + assert(q_cur_loc(q) == loc); DEBUG_PRINTF("checking for report %u\n", leftfixReport); char rv = nfaInAcceptState(q->nfa, leftfixReport, q); DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); @@ -660,6 +636,153 @@ int reachHasBit(const u8 *reach, u8 c) { return !!(reach[c / 8U] & (u8)1U << (c % 8U)); } +/* + * Generate a 8-byte valid_mask with #high bytes 0 from the highest side + * and #low bytes 0 from the lowest side + * and (8 - high - low) bytes '0xff' in the middle. 
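+ * e.g. high = 2, low = 1 gives 0x0000ffffffffff00: the two highest bytes and + * the single lowest byte are invalid, the five bytes in between are 0xff.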
+ */ +static rose_inline +u64a generateValidMask(const s32 high, const s32 low) { + assert(high + low < 8); + DEBUG_PRINTF("high %d low %d\n", high, low); + const u64a ones = ~0ull; + return (ones << ((high + low) * 8)) >> (high * 8); +} + +/* + * Do the single-byte check if only one lookaround entry exists + * and it's a single mask. + * Return success if the byte is in the future or before history + * (offset is greater than (history) buffer length). + */ +static rose_inline +int roseCheckByte(const struct core_info *ci, u8 and_mask, u8 cmp_mask, + u8 negation, s32 checkOffset, u64a end) { + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const s64a base_offset = end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + u8 c; + if (offset >= 0) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + return 1; + } else { + assert(offset < (s64a)ci->len); + DEBUG_PRINTF("check byte in buffer\n"); + c = ci->buf[offset]; + } + } else { + if (offset >= -(s64a) ci->hlen) { + DEBUG_PRINTF("check byte in history\n"); + c = ci->hbuf[ci->hlen + offset]; + } else { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + } + + if (((and_mask & c) != cmp_mask) ^ negation) { + DEBUG_PRINTF("char 0x%02x at offset %lld failed byte check\n", + c, offset); + return 0; + } + + DEBUG_PRINTF("real offset=%lld char=%02x\n", offset, c); + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static rose_inline +int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("rel offset %lld\n",base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a data = 0; + u64a valid_data_mask = ~0ULL; // mask for validate check. + //A 0xff byte means that this byte is in the buffer. + s32 shift_l = 0; // size of bytes in the future. + s32 shift_r = 0; // size of bytes before the history. + s32 h_len = 0; // size of bytes in the history buffer. + s32 c_len = 8; // size of bytes in the current buffer. + if (offset < 0) { + // in or before history buffer. + if (offset + 8 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + const u8 *h_start = ci->hbuf; // start pointer in history buffer. + if (offset < -(s64a)ci->hlen) { + // some bytes are before history. + shift_r = -(offset + (s64a)ci->hlen); + DEBUG_PRINTF("shift_r %d", shift_r); + } else { + h_start += ci->hlen + offset; + } + if (offset + 7 < 0) { + DEBUG_PRINTF("all in history buffer\n"); + data = partial_load_u64a(h_start, 8 - shift_r); + } else { + // history part + c_len = offset + 8; + h_len = -offset - shift_r; + DEBUG_PRINTF("%d bytes in history\n", h_len); + s64a data_h = 0; + data_h = partial_load_u64a(h_start, h_len); + // current part + if (c_len > (s64a)ci->len) { + shift_l = c_len - ci->len; + c_len = ci->len; + } + data = partial_load_u64a(ci->buf, c_len); + data <<= h_len << 3; + data |= data_h; + } + if (shift_r) { + data <<= shift_r << 3; + } + } else { + // current buffer. 
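+ // offset >= 0 here: the eight-byte window starts inside the current + // buffer and may run past its end (those bytes count as being in the + // future).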
+ if (offset + c_len > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future\n"); + return 1; + } + // some bytes in the future. + shift_l = offset + c_len - ci->len; + c_len = ci->len - offset; + data = partial_load_u64a(ci->buf + offset, c_len); + } else { + data = unaligned_load_u64a(ci->buf + offset); + } + } + + if (shift_l || shift_r) { + valid_data_mask = generateValidMask(shift_l, shift_r); + } + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); + + if (validateMask(data, valid_data_mask, + and_mask, cmp_mask, neg_mask)) { + DEBUG_PRINTF("check mask successfully\n"); + return 1; + } else { + return 0; + } +} /** * \brief Scan around a literal, checking that that "lookaround" reach masks * are satisfied. @@ -754,13 +877,7 @@ int roseCheckLookaround(const struct RoseEngine *t, return 1; } -static -int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, - void *context) { - u64a *som = context; - *som = MIN(*som, from_offset); - return MO_CONTINUE_MATCHING; -} +int roseNfaEarliestSom(u64a start, u64a end, ReportID id, void *context); static rose_inline u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, @@ -780,13 +897,13 @@ u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, u64a start = ~0ULL; /* switch the callback + context for a fun one */ - q->som_cb = roseNfaEarliestSom; + q->cb = roseNfaEarliestSom; q->context = &start; nfaReportCurrentMatches(q->nfa, q); /* restore the old callback + context */ - q->som_cb = roseNfaSomAdaptor; + q->cb = roseNfaAdaptor; q->context = NULL; DEBUG_PRINTF("earliest som is %llu\n", start); return start; @@ -800,6 +917,144 @@ char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { return end >= min_bound && end <= max_bound; } +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + if (is_streaming) { + // Decompress stream state. 
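+ // (the engine's state is stored compressed in stream state; expand it + // so nfaCheckFinalState() below can inspect the full state)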
+ nfaExpandState(q->nfa, q->state, q->streamState, offset, key); + } + + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + q->context = NULL; + + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. */ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. + if (eod_len < rose->eodmatcherMinWidth) { + DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); + return HWLM_CONTINUE_MATCHING; + } + + // Ensure that we only need scan the last N bytes, where N is the length of + // the eod-anchored matcher region. + size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); + + const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); + hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, + scratch->tctxt.groups); + + // We may need to fire delayed matches. 
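+ // (the eod-anchored literal scan above may have queued delayed literals)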
+ if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + roseFlushLastByteHistory(rose, scratch, offset); + return HWLM_CONTINUE_MATCHING; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -823,16 +1078,21 @@ void updateSeqPoint(struct RoseContext *tctxt, u64a offset, } static rose_inline -hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, - char in_anchored, char in_catchup, char from_mpv, - char skip_mpv_catchup) { - DEBUG_PRINTF("program=%u, offsets [%llu,%llu]\n", programOffset, som, end); +hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, + u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); assert(programOffset >= sizeof(struct RoseEngine)); assert(programOffset < t->size); + const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + const char *pc_base = getByOffset(t, programOffset); const char *pc = pc_base; @@ -880,9 +1140,9 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_LIT_EARLY) { - if (end < t->floatingMinLiteralMatchOffset) { - DEBUG_PRINTF("halt: too soon, min offset=%u\n", - t->floatingMinLiteralMatchOffset); + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); return HWLM_CONTINUE_MATCHING; } } @@ -941,6 +1201,30 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { @@ -968,6 +1252,11 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(RECORD_ANCHORED) { + recordAnchoredLiteralMatch(t, scratch, ri->id, end); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CATCH_UP) { if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; @@ -1301,6 +1590,30 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) { + if 
(roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { DEBUG_PRINTF("finished\n"); return HWLM_CONTINUE_MATCHING; diff --git a/src/rose/rose.h b/src/rose/rose.h index e90d2f21..9a50f0e9 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -29,106 +29,26 @@ #ifndef ROSE_H #define ROSE_H -#include "rose_types.h" -#include "rose_internal.h" -#include "runtime.h" -#include "scratch.h" #include "ue2common.h" -#include "util/multibit.h" + +struct RoseEngine; +struct hs_scratch; // Initialise state space for engine use. void roseInitState(const struct RoseEngine *t, char *state); -void roseBlockEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch); -void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch); - -static really_inline -int roseBlockHasEodWork(const struct RoseEngine *t, - struct hs_scratch *scratch) { - if (t->ematcherOffset) { - DEBUG_PRINTF("eod matcher to run\n"); - return 1; - } - - if (t->eodProgramOffset) { - DEBUG_PRINTF("has eod program\n"); - return 1; - } - - void *state = scratch->core_info.state; - if (mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { - DEBUG_PRINTF("active outfix/suffix engines\n"); - return 1; - } - - if (t->eodIterOffset) { - u32 idx; - const struct mmbit_sparse_iter *it = getByOffset(t, t->eodIterOffset); - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - if (mmbit_sparse_iter_begin(getRoleState(state), t->rolesWithStateCount, - &idx, it, si_state) != MMB_INVALID) { - DEBUG_PRINTF("eod iter has states on\n"); - return 1; - } - } - - return 0; -} - /* assumes core_info in scratch has been init to point to data */ -static really_inline -void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { - assert(t); - assert(scratch); - assert(scratch->core_info.buf); - - // We should not have been called if we've already been told to terminate - // matching. - assert(!told_to_stop_matching(scratch)); - - // If this block is shorter than our minimum width, then no pattern in this - // RoseEngine could match. - /* minWidth checks should have already been performed by the caller */ - const size_t length = scratch->core_info.len; - assert(length >= t->minWidth); - - // Similarly, we may have a maximum width (for engines constructed entirely - // of bi-anchored patterns). 
- /* This check is now handled by the interpreter */ - assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF - || length <= t->maxBiAnchoredWidth); - - roseBlockExec_i(t, scratch); - - if (!t->requiresEodCheck) { - return; - } - - if (can_stop_matching(scratch)) { - DEBUG_PRINTF("bailing, already halted\n"); - return; - } - - if (!roseBlockHasEodWork(t, scratch)) { - DEBUG_PRINTF("no eod work\n"); - return; - } - - roseBlockEodExec(t, length, scratch); -} +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch); /* assumes core_info in scratch has been init to point to data */ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); -void roseEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch); +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch); -hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, - void *context); +hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *context); -int roseReportAdaptor(u64a offset, ReportID id, void *context); -int roseReportSomAdaptor(u64a som, u64a offset, ReportID id, void *context); +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, u64a stream_offset, struct hs_scratch *scratch); diff --git a/src/rose/rose_build.h b/src/rose/rose_build.h index bef2114f..c71671fa 100644 --- a/src/rose/rose_build.h +++ b/src/rose/rose_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,6 +65,7 @@ struct raw_som_dfa; class CharReach; class NGHolder; class ReportManager; +class SmallWriteBuild; class SomSlotManager; class RoseDedupeAux { @@ -128,6 +129,7 @@ public: // Construct a usable Rose builder. std::unique_ptr makeRoseBuilder(ReportManager &rm, SomSlotManager &ssm, + SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary); @@ -140,9 +142,6 @@ size_t roseSize(const RoseEngine *t); * intended to indicate a lightweight rose. 
*/ u32 roseQuality(const RoseEngine *t); -ue2::aligned_unique_ptr -roseAddSmallWrite(const RoseEngine *t, const SmallWriteEngine *smwr); - bool roseIsPureLiteral(const RoseEngine *t); size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay); diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 23c122a7..0f0e8d18 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -315,7 +315,7 @@ void createVertices(RoseBuildImpl *tbi, w = created[key]; } - NFAVertex p = pv.first; + RoseVertex p = pv.first; RoseEdge e; bool added; @@ -375,7 +375,7 @@ void createVertices(RoseBuildImpl *tbi, /* ensure the holder does not accept any paths which do not end with lit */ static void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { - DEBUG_PRINTF("strip '%s'\n", ((const string &)lit).c_str()); + DEBUG_PRINTF("strip '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); curr.insert(g.acceptEod); @@ -418,6 +418,7 @@ void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { } pruneUseless(g); + clearReports(g); assert(in_degree(g.accept, g) || in_degree(g.acceptEod, g) > 1); assert(allMatchStatesHaveReports(g)); @@ -651,26 +652,93 @@ floating: } static -unique_ptr makeRoseEodPrefix(const NGHolder &h, - ReportID prefix_report) { +unique_ptr makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, + map, ReportID> &remap) { assert(generates_callbacks(h)); - auto g = cloneHolder(h); - g->kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; - setReportId(*g, prefix_report); + assert(!in_degree(h.accept, h)); + auto gg = cloneHolder(h); + NGHolder &g = *gg; + g.kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; // Move acceptEod edges over to accept. vector dead; - for (const auto &e : in_edges_range(g->acceptEod, *g)) { - NFAVertex u = source(e, *g); - if (u == g->accept) { + for (const auto &e : in_edges_range(g.acceptEod, g)) { + NFAVertex u = source(e, g); + if (u == g.accept) { continue; } - add_edge_if_not_present(u, g->accept, *g); + add_edge_if_not_present(u, g.accept, g); dead.push_back(e); + + if (!contains(remap, g[u].reports)) { + remap[g[u].reports] = build.getNewNfaReport(); + } + + g[u].reports = { remap[g[u].reports] }; } - remove_edges(dead, *g); - return g; + remove_edges(dead, g); + return gg; +} + +static +u32 getEodEventID(RoseBuildImpl &build) { + // Allocate the EOD event if it hasn't been already. 
+ if (build.eod_event_literal_id == MO_INVALID_IDX) { + build.eod_event_literal_id = build.getLiteralId({}, 0, ROSE_EVENT); + } + + return build.eod_event_literal_id; +} + +static +void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, + const NGHolder &h) { + assert(!build.isInETable(u)); + + RoseGraph &g = build.g; + map, ReportID> report_remap; + shared_ptr eod_leftfix + = makeRoseEodPrefix(h, build, report_remap); + + u32 eod_event = getEodEventID(build); + + for (const auto &report_mapping : report_remap) { + RoseVertex v = add_vertex(g); + g[v].idx = build.vertexIndex++; + g[v].literals.insert(eod_event); + build.literal_info[eod_event].vertices.insert(v); + + g[v].left.graph = eod_leftfix; + g[v].left.leftfix_report = report_mapping.second; + g[v].left.lag = 0; + RoseEdge e1 = add_edge(u, v, g).first; + g[e1].minBound = 0; + g[e1].maxBound = ROSE_BOUND_INF; + g[v].min_offset = add_rose_depth(g[u].min_offset, + findMinWidth(*g[v].left.graph)); + g[v].max_offset = ROSE_BOUND_INF; + + depth max_width = findMaxWidth(*g[v].left.graph); + if (u != build.root && max_width.is_finite() + && (!build.isAnyStart(u) || isPureAnchored(*g[v].left.graph))) { + g[e1].maxBound = max_width; + g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); + } + + g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix + RoseVertex w = add_vertex(g); + g[w].idx = build.vertexIndex++; + g[w].eod_accept = true; + g[w].reports = report_mapping.first; + g[w].min_offset = g[v].min_offset; + g[w].max_offset = g[v].max_offset; + RoseEdge e = add_edge(v, w, g).first; + g[e].minBound = 0; + g[e].maxBound = 0; + g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; + DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + } } static @@ -686,8 +754,20 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, RoseVertex u = pv.first; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; + /* We need to duplicate the parent vertices if: + * + * 1) It already has a suffix, etc as we are going to add the specified + * suffix, etc to the parents and we do not want to overwrite the + * existing information. + * + * 2) We are making the an EOD accept and the vertex already has other + * out-edges - The LAST_BYTE history used for EOD accepts is + * incompatible with normal successors. As accepts are processed last we + * do not need to worry about other normal successors being added later. 
+ */ if (g[u].suffix || !g[u].reports.empty() - /* also poss accept eod edge: TODO check properly */ + || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) + && !edge_props.graph) || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { DEBUG_PRINTF("duplicating for parent %zu\n", g[u].idx); assert(!tbi->isAnyStart(u)); @@ -719,74 +799,37 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, } } else { assert(ig[iv].type == RIV_ACCEPT_EOD); + assert(!edge_props.haig); - if (edge_props.graph && tbi->isInETable(u)) { + if (!edge_props.graph) { + RoseVertex w = add_vertex(g); + g[w].idx = tbi->vertexIndex++; + g[w].eod_accept = true; + g[w].reports = ig[iv].reports; + g[w].min_offset = g[u].min_offset; + g[w].max_offset = g[u].max_offset; + RoseEdge e = add_edge(u, w, g).first; + g[e].minBound = 0; + g[e].maxBound = 0; + g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; + DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + continue; + } + + const NGHolder &h = *edge_props.graph; + assert(!in_degree(h.accept, h)); + assert(generates_callbacks(h)); + + if (tbi->isInETable(u)) { + assert(h.kind == NFA_SUFFIX); assert(!tbi->isAnyStart(u)); /* etable can't/shouldn't use eod event */ DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); g[u].suffix.graph = edge_props.graph; - assert(g[u].suffix.graph->kind == NFA_SUFFIX); - dumpHolder(*g[u].suffix.graph, 98, "eod_suffix", tbi->cc.grey); - assert(!in_degree(g[u].suffix.graph->accept, - *g[u].suffix.graph)); - set reports = all_reports(*g[u].suffix.graph); - tbi->rm.getReport(*reports.begin()); - assert(reports.size() == 1); - /* TODO: set dfa_(min|max)_width */ continue; - } else if (edge_props.graph) { - assert(!edge_props.haig); - assert(!tbi->isInETable(u)); - - // Allocate the EOD event if it hasn't been already. - if (tbi->eod_event_literal_id == MO_INVALID_IDX) { - tbi->eod_event_literal_id = - tbi->getLiteralId(ue2_literal(), 0, ROSE_EVENT); - } - - RoseVertex v = add_vertex(g); - g[v].idx = tbi->vertexIndex++; - g[v].literals.insert(tbi->eod_event_literal_id); - tbi->literal_info[tbi->eod_event_literal_id].vertices.insert(v); - - ReportID prefix_report = tbi->getNewNfaReport(); - g[v].left.graph - = makeRoseEodPrefix(*edge_props.graph, prefix_report); - g[v].left.leftfix_report = prefix_report; - g[v].left.lag = 0; - RoseEdge e1 = add_edge(u, v, g).first; - g[e1].minBound = 0; - g[e1].maxBound = ROSE_BOUND_INF; - g[v].min_offset = add_rose_depth( - g[u].min_offset, findMinWidth(*g[v].left.graph)); - g[v].max_offset = ROSE_BOUND_INF; - - DEBUG_PRINTF("hi\n"); - depth max_width = findMaxWidth(*g[v].left.graph); - if (u != tbi->root - && max_width.is_finite() - && (!tbi->isAnyStart(u) - || isPureAnchored(*g[v].left.graph))) { - g[e1].maxBound = max_width; - g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); - } - - g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix - u = v; } - assert(!edge_props.haig); - RoseVertex w = add_vertex(g); - g[w].idx = tbi->vertexIndex++; - g[w].eod_accept = true; - g[w].reports = ig[iv].reports; - g[w].min_offset = g[u].min_offset; - g[w].max_offset = g[u].max_offset; - RoseEdge e = add_edge(u, w, g).first; - g[e].minBound = 0; - g[e].maxBound = 0; - g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; - DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + makeEodEventLeftfix(*tbi, u, h); } } } @@ -887,7 +930,8 @@ bool suitableForEod(const RoseInGraph &ig, vector topo, ENSURE_AT_LEAST(&v_depth, (u32)max_width); } - if (v_depth == ROSE_BOUND_INF || v_depth > cc.grey.maxHistoryAvailable) { + if (v_depth == 
ROSE_BOUND_INF + || v_depth > cc.grey.maxHistoryAvailable) { DEBUG_PRINTF("not suitable for eod table %u\n", v_depth); return false; } @@ -900,6 +944,13 @@ bool suitableForEod(const RoseInGraph &ig, vector topo, return true; } +static +void shift_accepts_to_end(const RoseInGraph &ig, + vector &topo_order) { + stable_partition(begin(topo_order), end(topo_order), + [&](RoseInVertex v){ return !is_any_accept(v, ig); }); +} + static void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; @@ -912,6 +963,7 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { map > vertex_map; vector v_order = topo_order(ig); + shift_accepts_to_end(ig, v_order); u32 eod_space_required; bool use_eod_table = suitableForEod(ig, v_order, &eod_space_required, @@ -943,7 +995,7 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const vector &images = vertex_map[u]; // We should have no dupes. - assert(set(images.begin(), images.end()).size() + assert(set(images.begin(), images.end()).size() == images.size()); for (auto v_image : images) { @@ -1038,6 +1090,7 @@ bool canImplementGraph(RoseBuildImpl *tbi, const RoseInGraph &in, NGHolder &h, return false; } break; + case NFA_EAGER_PREFIX: case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: DEBUG_PRINTF("kind %u\n", (u32)h.kind); @@ -1133,7 +1186,7 @@ u32 maxAvailableDelay(const ue2_literal &pred_key, const ue2_literal &lit_key) { } static -u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseVertex v) { +u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseInVertex v) { // First, check the overlap constraints on (u,v). size_t max_delay; if (ig[v].type == RIV_LITERAL) { diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index d8eb939a..45333a38 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -336,7 +336,8 @@ void buildLiteralMask(const vector &mask, vector &msk, } static -bool validateTransientMask(const vector &mask, bool eod, const Grey &grey) { +bool validateTransientMask(const vector &mask, bool anchored, + bool eod, const Grey &grey) { assert(!mask.empty()); // An EOD anchored mask requires that everything fit into history, while an @@ -348,6 +349,12 @@ bool validateTransientMask(const vector &mask, bool eod, const Grey & return false; } + /* although anchored masks cannot be transient, short masks may be placed + * into the atable. 
*/ + if (anchored && mask.size() > grey.maxAnchoredRegion) { + return false; + } + vector lits; u32 lit_minBound; /* minBound of each literal in lit */ u32 lit_length; /* length of each literal in lit */ @@ -703,7 +710,7 @@ bool checkAllowMask(const vector &mask, ue2_literal *lit, bool RoseBuildImpl::add(bool anchored, const vector &mask, const ue2::flat_set &reports) { - if (validateTransientMask(mask, false, cc.grey)) { + if (validateTransientMask(mask, anchored, false, cc.grey)) { bool eod = false; addTransientMask(*this, mask, reports, anchored, eod); return true; @@ -726,8 +733,8 @@ bool RoseBuildImpl::add(bool anchored, const vector &mask, bool RoseBuildImpl::validateMask(const vector &mask, UNUSED const ue2::flat_set &reports, - UNUSED bool anchored, bool eod) const { - return validateTransientMask(mask, eod, cc.grey); + bool anchored, bool eod) const { + return validateTransientMask(mask, anchored, eod, cc.grey); } static diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 35ff7138..60732ff9 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -173,34 +173,54 @@ void mergeAnchoredDfas(vector> &dfas, } static -void translateReportSet(flat_set *rset, const RoseBuildImpl &tbi) { - flat_set old; - old.swap(*rset); - for (auto report_id : old) { - DEBUG_PRINTF("updating %u -> %u\n", report_id, - tbi.literal_info[report_id].final_id); - rset->insert(tbi.literal_info[report_id].final_id); - } -} - -static -void remapAnchoredReports(raw_dfa &dfa, const RoseBuildImpl &tbi) { - for (dstate &ds : dfa.states) { - translateReportSet(&ds.reports, tbi); - translateReportSet(&ds.reports_eod, tbi); - } -} - -/* Replaces the report ids currently in the dfas (rose graph literal ids) with - * the final id used by the runtime. */ -static -void remapAnchoredReports(RoseBuildImpl &tbi) { - for (auto it = tbi.anchored_nfas.begin(); it != tbi.anchored_nfas.end(); - ++it) { - for (auto &rdfa : it->second) { - assert(rdfa); - remapAnchoredReports(*rdfa, tbi); +void remapAnchoredReports(raw_dfa &rdfa, const RoseBuildImpl &build) { + for (dstate &ds : rdfa.states) { + assert(ds.reports_eod.empty()); // Not used in anchored matcher. + if (ds.reports.empty()) { + continue; } + + flat_set new_reports; + for (auto id : ds.reports) { + assert(id < build.literal_info.size()); + new_reports.insert(build.literal_info.at(id).final_id); + } + ds.reports = move(new_reports); + } +} + +/** + * \brief Replaces the report ids currently in the dfas (rose graph literal + * ids) with the final id for each literal. + */ +static +void remapAnchoredReports(RoseBuildImpl &build) { + for (auto &m : build.anchored_nfas) { + for (auto &rdfa : m.second) { + assert(rdfa); + remapAnchoredReports(*rdfa, build); + } + } +} + +/** + * \brief Replace the reports (which are literal final_ids) in the given + * raw_dfa with program offsets. + */ +static +void remapIdsToPrograms(raw_dfa &rdfa, const vector &litPrograms) { + for (dstate &ds : rdfa.states) { + assert(ds.reports_eod.empty()); // Not used in anchored matcher. 
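
/*
 * Minimal sketch of the remapping pattern used here: remapAnchoredReports()
 * rewrites each DFA state's reports from Rose literal ids to final_ids, and
 * remapIdsToPrograms() later rewrites those final_ids to literal program
 * offsets via the litPrograms table. std::set<unsigned> stands in for
 * flat_set<ReportID>, and remapReports() is a hypothetical helper written
 * only for this example.
 */
#include <cassert>
#include <set>
#include <utility>
#include <vector>

using ReportSet = std::set<unsigned>;

// Rewrite a report set through a lookup table indexed by the old id, the way
// remapIdsToPrograms() uses the litPrograms vector (indexed by final_id).
static void remapReports(ReportSet &reports,
                         const std::vector<unsigned> &table) {
    ReportSet out;
    for (unsigned id : reports) {
        assert(id < table.size());
        out.insert(table[id]);
    }
    reports = std::move(out);
}

int main() {
    std::vector<unsigned> litPrograms = {120, 184, 240}; // offset per final_id
    ReportSet stateReports = {0, 2};
    remapReports(stateReports, litPrograms);
    assert(stateReports == ReportSet({120, 240}));
    return 0;
}
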
+ if (ds.reports.empty()) { + continue; + } + + flat_set new_reports; + for (auto id : ds.reports) { + assert(id < litPrograms.size()); + new_reports.insert(litPrograms.at(id)); + } + ds.reports = move(new_reports); } } @@ -476,7 +496,7 @@ NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) { } if (lit_verts.empty()) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } bool nocase = false; @@ -488,7 +508,7 @@ NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) { if (cr.isAlpha()) { bool cr_nocase = cr.count() != 1; if (case_set && cr_nocase != nocase) { - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } case_set = true; @@ -511,7 +531,7 @@ bool isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound, DEBUG_PRINTF("looking for simple case\n"); NFAVertex lit_head = extractLiteral(h, lit); - if (lit_head == NFAGraph::null_vertex()) { + if (lit_head == NGHolder::null_vertex()) { DEBUG_PRINTF("no literal found\n"); return false; } @@ -826,7 +846,7 @@ vector buildAnchoredDfas(RoseBuildImpl &build) { aligned_unique_ptr buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, - size_t *asize) { + const vector &litPrograms, size_t *asize) { const CompileContext &cc = build.cc; if (dfas.empty()) { @@ -835,6 +855,10 @@ buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, return nullptr; } + for (auto &rdfa : dfas) { + remapIdsToPrograms(rdfa, litPrograms); + } + vector> nfas; vector start_offset; // start offset for each dfa (dots removed) size_t total_size = buildNfas(dfas, &nfas, &start_offset, cc, build.rm); diff --git a/src/rose/rose_build_anchored.h b/src/rose/rose_build_anchored.h index a5317f89..ef06fcbb 100644 --- a/src/rose/rose_build_anchored.h +++ b/src/rose/rose_build_anchored.h @@ -39,13 +39,10 @@ #include struct anchored_matcher_info; -struct RoseEngine; namespace ue2 { -class NGHolder; class RoseBuildImpl; -struct Grey; struct raw_dfa; /** @@ -56,10 +53,13 @@ std::vector buildAnchoredDfas(RoseBuildImpl &build); /** * \brief Construct an anchored_matcher_info runtime structure from the given * set of DFAs. + * + * Remap the literal final_ids used for raw_dfa reports to the program offsets + * given in litPrograms. 
*/ aligned_unique_ptr buildAnchoredMatcher(RoseBuildImpl &build, std::vector &dfas, - size_t *asize); + const std::vector &litPrograms, size_t *asize); u32 anchoredStateSize(const anchored_matcher_info &atable); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 758dd442..56591de8 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -33,6 +33,8 @@ #include "hs_compile.h" // for HS_MODE_* #include "rose_build_add_internal.h" #include "rose_build_anchored.h" +#include "rose_build_exclusive.h" +#include "rose_build_groups.h" #include "rose_build_infix.h" #include "rose_build_lookaround.h" #include "rose_build_matchers.h" @@ -48,7 +50,11 @@ #include "nfa/nfa_api_queue.h" #include "nfa/nfa_build_util.h" #include "nfa/nfa_internal.h" +#include "nfa/shengcompile.h" #include "nfa/shufticompile.h" +#include "nfa/tamaramacompile.h" +#include "nfa/tamarama_internal.h" +#include "nfagraph/ng_execute.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_lbr.h" #include "nfagraph/ng_limex.h" @@ -59,6 +65,7 @@ #include "nfagraph/ng_stop.h" #include "nfagraph/ng_util.h" #include "nfagraph/ng_width.h" +#include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" #include "util/alloc.h" #include "util/bitutils.h" @@ -69,8 +76,10 @@ #include "util/compile_error.h" #include "util/container.h" #include "util/graph_range.h" +#include "util/make_unique.h" #include "util/multibit_build.h" #include "util/order_check.h" +#include "util/popcount.h" #include "util/queue_index_factory.h" #include "util/report_manager.h" #include "util/ue2string.h" @@ -191,10 +200,13 @@ public: case ROSE_INSTR_CHECK_BOUNDS: return &u.checkBounds; case ROSE_INSTR_CHECK_NOT_HANDLED: return &u.checkNotHandled; case ROSE_INSTR_CHECK_LOOKAROUND: return &u.checkLookaround; + case ROSE_INSTR_CHECK_MASK: return &u.checkMask; + case ROSE_INSTR_CHECK_BYTE: return &u.checkByte; case ROSE_INSTR_CHECK_INFIX: return &u.checkInfix; case ROSE_INSTR_CHECK_PREFIX: return &u.checkPrefix; case ROSE_INSTR_ANCHORED_DELAY: return &u.anchoredDelay; case ROSE_INSTR_PUSH_DELAYED: return &u.pushDelayed; + case ROSE_INSTR_RECORD_ANCHORED: return &u.recordAnchored; case ROSE_INSTR_CATCH_UP: return &u.catchUp; case ROSE_INSTR_CATCH_UP_MPV: return &u.catchUpMpv; case ROSE_INSTR_SOM_ADJUST: return &u.somAdjust; @@ -222,6 +234,9 @@ public: case ROSE_INSTR_CHECK_STATE: return &u.checkState; case ROSE_INSTR_SPARSE_ITER_BEGIN: return &u.sparseIterBegin; case ROSE_INSTR_SPARSE_ITER_NEXT: return &u.sparseIterNext; + case ROSE_INSTR_ENGINES_EOD: return &u.enginesEod; + case ROSE_INSTR_SUFFIXES_EOD: return &u.suffixesEod; + case ROSE_INSTR_MATCHER_EOD: return &u.matcherEod; case ROSE_INSTR_END: return &u.end; } assert(0); @@ -237,10 +252,13 @@ public: case ROSE_INSTR_CHECK_BOUNDS: return sizeof(u.checkBounds); case ROSE_INSTR_CHECK_NOT_HANDLED: return sizeof(u.checkNotHandled); case ROSE_INSTR_CHECK_LOOKAROUND: return sizeof(u.checkLookaround); + case ROSE_INSTR_CHECK_MASK: return sizeof(u.checkMask); + case ROSE_INSTR_CHECK_BYTE: return sizeof(u.checkByte); case ROSE_INSTR_CHECK_INFIX: return sizeof(u.checkInfix); case ROSE_INSTR_CHECK_PREFIX: return sizeof(u.checkPrefix); case ROSE_INSTR_ANCHORED_DELAY: return sizeof(u.anchoredDelay); case ROSE_INSTR_PUSH_DELAYED: return sizeof(u.pushDelayed); + case ROSE_INSTR_RECORD_ANCHORED: return sizeof(u.recordAnchored); case ROSE_INSTR_CATCH_UP: return sizeof(u.catchUp); case ROSE_INSTR_CATCH_UP_MPV: return sizeof(u.catchUpMpv); case 
ROSE_INSTR_SOM_ADJUST: return sizeof(u.somAdjust); @@ -268,6 +286,9 @@ public: case ROSE_INSTR_CHECK_STATE: return sizeof(u.checkState); case ROSE_INSTR_SPARSE_ITER_BEGIN: return sizeof(u.sparseIterBegin); case ROSE_INSTR_SPARSE_ITER_NEXT: return sizeof(u.sparseIterNext); + case ROSE_INSTR_ENGINES_EOD: return sizeof(u.enginesEod); + case ROSE_INSTR_SUFFIXES_EOD: return sizeof(u.suffixesEod); + case ROSE_INSTR_MATCHER_EOD: return sizeof(u.matcherEod); case ROSE_INSTR_END: return sizeof(u.end); } assert(0); @@ -282,10 +303,13 @@ public: ROSE_STRUCT_CHECK_BOUNDS checkBounds; ROSE_STRUCT_CHECK_NOT_HANDLED checkNotHandled; ROSE_STRUCT_CHECK_LOOKAROUND checkLookaround; + ROSE_STRUCT_CHECK_MASK checkMask; + ROSE_STRUCT_CHECK_BYTE checkByte; ROSE_STRUCT_CHECK_INFIX checkInfix; ROSE_STRUCT_CHECK_PREFIX checkPrefix; ROSE_STRUCT_ANCHORED_DELAY anchoredDelay; ROSE_STRUCT_PUSH_DELAYED pushDelayed; + ROSE_STRUCT_RECORD_ANCHORED recordAnchored; ROSE_STRUCT_CATCH_UP catchUp; ROSE_STRUCT_CATCH_UP_MPV catchUpMpv; ROSE_STRUCT_SOM_ADJUST somAdjust; @@ -313,6 +337,9 @@ public: ROSE_STRUCT_CHECK_STATE checkState; ROSE_STRUCT_SPARSE_ITER_BEGIN sparseIterBegin; ROSE_STRUCT_SPARSE_ITER_NEXT sparseIterNext; + ROSE_STRUCT_ENGINES_EOD enginesEod; + ROSE_STRUCT_SUFFIXES_EOD suffixesEod; + ROSE_STRUCT_MATCHER_EOD matcherEod; ROSE_STRUCT_END end; } u; @@ -349,6 +376,7 @@ struct RoseResources { bool has_lit_delay = false; bool has_lit_mask = false; bool has_anchored = false; + bool has_eod = false; }; struct build_context : boost::noncopyable { @@ -391,6 +419,10 @@ struct build_context : boost::noncopyable { * that have already been pushed into the engine_blob. */ ue2::unordered_map engineOffsets; + /** \brief Literal programs, indexed by final_id, after they have been + * written to the engine_blob. */ + vector litPrograms; + /** \brief Minimum offset of a match from the floating table. */ u32 floatingMinLiteralMatchOffset = 0; @@ -408,6 +440,13 @@ struct build_context : boost::noncopyable { /** \brief Resources in use (tracked as programs are added). */ RoseResources resources; + /** \brief Mapping from every vertex to the groups that must be on for that + * vertex to be reached. */ + ue2::unordered_map vertex_group_map; + + /** \brief Global bitmap of groups that can be squashed. */ + rose_group squashable_groups = 0; + /** \brief Base offset of engine_blob in the Rose engine bytecode. 
*/ static constexpr u32 engine_blob_base = ROUNDUP_CL(sizeof(RoseEngine)); }; @@ -460,7 +499,7 @@ u32 add_to_engine_blob(build_context &bc, const T &a, const size_t len) { template static u32 add_to_engine_blob(build_context &bc, Iter b, const Iter &e) { - using value_type = typename Iter::value_type; + using value_type = typename std::iterator_traits::value_type; static_assert(is_pod::value, "should be pod"); if (b == e) { @@ -567,6 +606,11 @@ bool isPureFloating(const RoseResources &resources) { return false; } + if (resources.has_eod) { + DEBUG_PRINTF("has eod work to do\n"); + return false; + } + if (resources.has_states) { DEBUG_PRINTF("has states\n"); return false; @@ -622,6 +666,7 @@ u8 pickRuntimeImpl(const RoseBuildImpl &build, const build_context &bc, DEBUG_PRINTF("has_lit_delay=%d\n", bc.resources.has_lit_delay); DEBUG_PRINTF("has_lit_mask=%d\n", bc.resources.has_lit_mask); DEBUG_PRINTF("has_anchored=%d\n", bc.resources.has_anchored); + DEBUG_PRINTF("has_eod=%d\n", bc.resources.has_eod); if (isPureFloating(bc.resources)) { return ROSE_RUNTIME_PURE_LITERAL; @@ -819,13 +864,18 @@ aligned_unique_ptr pickImpl(aligned_unique_ptr dfa_impl, aligned_unique_ptr nfa_impl) { assert(nfa_impl); assert(dfa_impl); - assert(isMcClellanType(dfa_impl->type)); + assert(isDfaType(dfa_impl->type)); // If our NFA is an LBR, it always wins. if (isLbrType(nfa_impl->type)) { return nfa_impl; } + // if our DFA is an accelerated Sheng, it always wins. + if (isShengType(dfa_impl->type) && has_accel(*dfa_impl)) { + return dfa_impl; + } + bool d_accel = has_accel(*dfa_impl); bool n_accel = has_accel(*nfa_impl); bool d_big = dfa_impl->type == MCCLELLAN_NFA_16; @@ -878,6 +928,18 @@ buildRepeatEngine(const CastleProto &proto, return castle_nfa; } +static +aligned_unique_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, + const ReportManager &rm) { + // Unleash the Sheng!! + auto dfa = shengCompile(rdfa, cc, rm); + if (!dfa) { + // Sheng wasn't successful, so unleash McClellan! + dfa = mcclellanCompile(rdfa, cc, rm); + } + return dfa; +} + /* builds suffix nfas */ static aligned_unique_ptr @@ -898,7 +960,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, } if (suff.dfa()) { - auto d = mcclellanCompile(*suff.dfa(), cc, rm); + auto d = getDfa(*suff.dfa(), cc, rm); assert(d); return d; } @@ -927,7 +989,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0), cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); assert(d); if (cc.grey.roseMcClellanSuffix != 2) { n = pickImpl(move(d), move(n)); @@ -1022,8 +1084,9 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, // streaming mode. const bool compress_state = !is_transient; - assert(!left.graph() - || left.graph()->kind == (is_prefix ? NFA_PREFIX : NFA_INFIX)); + assert(is_prefix || !left.graph() || left.graph()->kind == NFA_INFIX); + assert(!is_prefix || !left.graph() || left.graph()->kind == NFA_PREFIX + || left.graph()->kind == NFA_EAGER_PREFIX); // Holder should be implementable as an NFA at the very least. 
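
/*
 * Minimal sketch of the first-choice/fallback shape used by getDfa() above:
 * the shuffle-based Sheng engine is tried first and McClellan is only used
 * when shengCompile() declines the DFA (returns nullptr). The stand-in
 * builders below, including the 16-state cut-off, are invented for this
 * example; only the fallback structure mirrors the real code.
 */
#include <cassert>
#include <cstring>
#include <memory>

struct Engine { const char *kind; };

// Stand-in for shengCompile(): succeeds only for small DFAs.
static std::unique_ptr<Engine> buildSpecialised(unsigned nStates) {
    if (nStates > 16) {
        return nullptr; // too big for the specialised engine
    }
    return std::unique_ptr<Engine>(new Engine{"sheng"});
}

// Stand-in for mcclellanCompile(): the general-purpose fallback.
static std::unique_ptr<Engine> buildGeneral(unsigned) {
    return std::unique_ptr<Engine>(new Engine{"mcclellan"});
}

static std::unique_ptr<Engine> getEngine(unsigned nStates) {
    auto e = buildSpecialised(nStates); // preferred implementation first
    if (!e) {
        e = buildGeneral(nStates);      // fall back, as getDfa() does
    }
    return e;
}

int main() {
    assert(!strcmp(getEngine(8)->kind, "sheng"));
    assert(!strcmp(getEngine(200)->kind, "mcclellan"));
    return 0;
}
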
if (!left.dfa() && left.graph()) { @@ -1046,12 +1109,13 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, } if (left.dfa()) { - n = mcclellanCompile(*left.dfa(), cc, rm); + n = getDfa(*left.dfa(), cc, rm); } else if (left.graph() && cc.grey.roseMcClellanPrefix == 2 && is_prefix && !is_transient) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - n = mcclellanCompile(*rdfa, cc, rm); + n = getDfa(*rdfa, cc, rm); + assert(n); } } @@ -1065,7 +1129,9 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, if (!n && left.graph()) { map>> triggers; - findTriggerSequences(tbi, infixTriggers.at(left), &triggers); + if (left.graph()->kind == NFA_INFIX) { + findTriggerSequences(tbi, infixTriggers.at(left), &triggers); + } n = constructNFA(*left.graph(), nullptr, fixed_depth_tops, triggers, compress_state, cc); } @@ -1075,7 +1141,7 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); assert(d); n = pickImpl(move(d), move(n)); } @@ -1102,23 +1168,612 @@ void setLeftNfaProperties(NFA &n, const left_id &left) { } static -bool buildLeftfixes(const RoseBuildImpl &tbi, build_context &bc, - QueueIndexFactory &qif, set *no_retrigger_queues, - bool do_prefix) { - const RoseGraph &g = tbi.g; - const CompileContext &cc = tbi.cc; - const ReportManager &rm = tbi.rm; +void appendTailToHolder(NGHolder &h, const flat_set &reports, + const vector &starts, + const vector &tail) { + assert(!tail.empty()); + NFAVertex curr = add_vertex(h); + for (NFAVertex v : starts) { + assert(!edge(v, h.acceptEod, h).second); + assert(h[v].reports == reports); + h[v].reports.clear(); + remove_edge(v, h.accept, h); + add_edge(v, curr, h); + } + auto it = tail.begin(); + h[curr].char_reach = *it; + ++it; + while (it != tail.end()) { + NFAVertex old = curr; + curr = add_vertex(h); + add_edge(old, curr, h); + assert(!it->none()); + h[curr].char_reach = *it; + ++it; + } - ue2::unordered_map seen; // already built queue indices + h[curr].reports = reports; + add_edge(curr, h.accept, h); +} - map > infixTriggers; - findInfixTriggers(tbi, &infixTriggers); +static +void appendTailToHolder(NGHolder &h, const vector &tail) { + assert(in_degree(h.acceptEod, h) == 1); + assert(!tail.empty()); + map, vector > reporters; + for (auto v : inv_adjacent_vertices_range(h.accept, h)) { + reporters[h[v].reports].push_back(v); + } + + for (const auto &e : reporters) { + appendTailToHolder(h, e.first, e.second, tail); + } + + h.renumberEdges(); +} + +static +u32 decreaseLag(const RoseBuildImpl &build, NGHolder &h, + const vector &succs) { + const RoseGraph &rg = build.g; + static const size_t MAX_RESTORE_LEN = 5; + + vector restored(MAX_RESTORE_LEN); + for (RoseVertex v : succs) { + u32 lag = rg[v].left.lag; + for (u32 lit_id : rg[v].literals) { + u32 delay = build.literals.right.at(lit_id).delay; + const ue2_literal &literal = build.literals.right.at(lit_id).s; + assert(lag <= literal.length() + delay); + size_t base = literal.length() + delay - lag; + if (base >= literal.length()) { + return 0; + } + size_t len = literal.length() - base; + len = MIN(len, restored.size()); + restored.resize(len); + auto lit_it = literal.begin() + base; + for (u32 i = 0; i < len; i++) { + assert(lit_it != literal.end()); + restored[i] |= *lit_it; + ++lit_it; + } + } + } + + assert(!restored.empty()); + + 
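
/*
 * Minimal sketch of the arithmetic used by decreaseLag() just above. For a
 * successor with lag `lag` whose literal of length `len` is delivered with
 * delay `delay`, the literal characters from base = len + delay - lag onwards
 * have not yet been consumed by the prefix, so they can be appended to the
 * prefix graph and the lag reduced accordingly (the real code additionally
 * caps the restored tail at MAX_RESTORE_LEN). tailLength() is a hypothetical
 * helper written only for this example.
 */
#include <cassert>
#include <cstddef>

// Number of literal characters that could be folded back into the prefix.
static size_t tailLength(size_t lit_len, size_t delay, size_t lag) {
    assert(lag <= lit_len + delay);
    size_t base = lit_len + delay - lag;
    if (base >= lit_len) {
        return 0; // the lag does not reach back into the literal at all
    }
    return lit_len - base;
}

int main() {
    // Literal "abcdef" (len 6), no delay, lag 2: the final two characters can
    // be appended to the prefix, bringing the lag down to zero.
    assert(tailLength(6, 0, 2) == 2);
    // With delay 3 and lag 2 the lag lies entirely within the delay region.
    assert(tailLength(6, 3, 2) == 0);
    return 0;
}
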
appendTailToHolder(h, restored); + + return restored.size(); +} + +#define EAGER_DIE_BEFORE_LIMIT 10 + +struct eager_info { + shared_ptr new_graph; + u32 lag_adjust = 0; +}; + +static +bool checkSuitableForEager(bool is_prefix, const left_id &left, + const RoseBuildImpl &build, + const vector &succs, + rose_group squash_mask, rose_group initial_groups, + eager_info &ei, const CompileContext &cc) { + DEBUG_PRINTF("checking prefix --> %016llx...\n", squash_mask); + + const RoseGraph &rg = build.g; + + if (!is_prefix) { + DEBUG_PRINTF("not prefix\n"); + return false; /* only prefixes (for now...) */ + } + + if ((initial_groups & squash_mask) == initial_groups) { + DEBUG_PRINTF("no squash -- useless\n"); + return false; + } + + for (RoseVertex s : succs) { + if (build.isInETable(s) + || contains(rg[s].literals, build.eod_event_literal_id)) { + return false; /* Ignore EOD related prefixes */ + } + } + + if (left.dfa()) { + const raw_dfa &dfa = *left.dfa(); + if (dfa.start_floating != DEAD_STATE) { + return false; /* not purely anchored */ + } + if (!dfa.states[dfa.start_anchored].reports.empty()) { + return false; /* vacuous (todo: handle?) */ + } + + if (!can_die_early(dfa, EAGER_DIE_BEFORE_LIMIT)) { + return false; + } + ei.new_graph = rg[succs[0]].left.graph; + } else if (left.graph()) { + const NGHolder &g = *left.graph(); + if (proper_out_degree(g.startDs, g)) { + return false; /* not purely anchored */ + } + + ei.new_graph = cloneHolder(*left.graph()); + auto gg = ei.new_graph; + gg->kind = NFA_EAGER_PREFIX; + + ei.lag_adjust = decreaseLag(build, *gg, succs); + + if (is_match_vertex(gg->start, *gg)) { + return false; /* should not still be vacuous as lag decreased */ + } + + if (!can_die_early(*gg, EAGER_DIE_BEFORE_LIMIT)) { + DEBUG_PRINTF("not eager as stuck alive\n"); + return false; + } + + /* We need to ensure that adding in the literals does not cause us to no + * longer be able to build an nfa. */ + bool ok = isImplementableNFA(*gg, nullptr, cc); + if (!ok) { + return false; + } + } else { + DEBUG_PRINTF("unable to determine if good for eager running\n"); + return false; + } + + DEBUG_PRINTF("eager prefix\n"); + return true; +} + +static +left_id updateLeftfixWithEager(RoseGraph &g, const eager_info &ei, + const vector &succs) { + u32 lag_adjust = ei.lag_adjust; + auto gg = ei.new_graph; + for (RoseVertex v : succs) { + g[v].left.graph = gg; + assert(g[v].left.lag >= lag_adjust); + g[v].left.lag -= lag_adjust; + DEBUG_PRINTF("added %u literal chars back, new lag %u\n", lag_adjust, + g[v].left.lag); + } + left_id leftfix = g[succs[0]].left; + + if (leftfix.graph()) { + assert(leftfix.graph()->kind == NFA_PREFIX + || leftfix.graph()->kind == NFA_EAGER_PREFIX); + leftfix.graph()->kind = NFA_EAGER_PREFIX; + } + if (leftfix.dfa()) { + assert(leftfix.dfa()->kind == NFA_PREFIX); + leftfix.dfa()->kind = NFA_EAGER_PREFIX; + } + + return leftfix; +} + +static +bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, + const map > &infixTriggers, + set *no_retrigger_queues, set *eager_queues, + const map &eager, + const vector &succs, left_id leftfix) { + RoseGraph &g = build.g; + const CompileContext &cc = build.cc; + const ReportManager &rm = build.rm; + + bool is_transient = contains(build.transient, leftfix); + rose_group squash_mask = build.rose_squash_masks.at(leftfix); + + DEBUG_PRINTF("making %sleftfix\n", is_transient ? 
"transient " : ""); + + if (contains(eager, leftfix)) { + eager_queues->insert(qi); + leftfix = updateLeftfixWithEager(g, eager.at(leftfix), succs); + } + + aligned_unique_ptr nfa; + // Need to build NFA, which is either predestined to be a Haig (in SOM mode) + // or could be all manner of things. + if (leftfix.haig()) { + nfa = goughCompile(*leftfix.haig(), build.ssm.somPrecision(), cc, rm); + } else { + nfa = makeLeftNfa(build, leftfix, prefix, is_transient, infixTriggers, + cc); + } + + if (!nfa) { + assert(!"failed to build leftfix"); + return false; + } + + setLeftNfaProperties(*nfa, leftfix); + + build.leftfix_queue_map.emplace(leftfix, qi); + nfa->queueIndex = qi; + + if (!prefix && !leftfix.haig() && leftfix.graph() + && nfaStuckOn(*leftfix.graph())) { + DEBUG_PRINTF("%u sticks on\n", qi); + no_retrigger_queues->insert(qi); + } + + DEBUG_PRINTF("built leftfix, qi=%u\n", qi); + add_nfa_to_blob(bc, *nfa); + + // Leftfixes can have stop alphabets. + vector stop(N_CHARS, 0); + /* haigs track som information - need more care */ + som_type som = leftfix.haig() ? SOM_LEFT : SOM_NONE; + if (leftfix.graph()) { + stop = findLeftOffsetStopAlphabet(*leftfix.graph(), som); + } else if (leftfix.castle()) { + stop = findLeftOffsetStopAlphabet(*leftfix.castle(), som); + } + + // Infix NFAs can have bounds on their queue lengths. + u32 max_queuelen = UINT32_MAX; + if (!prefix) { + set lits; + for (RoseVertex v : succs) { + for (auto u : inv_adjacent_vertices_range(v, g)) { + for (u32 lit_id : g[u].literals) { + lits.insert(build.literals.right.at(lit_id).s); + } + } + } + DEBUG_PRINTF("%zu literals\n", lits.size()); + max_queuelen = findMaxInfixMatches(leftfix, lits); + if (max_queuelen < UINT32_MAX) { + max_queuelen++; + } + } + + u32 max_width; + if (is_transient) { + depth d = findMaxWidth(leftfix); + assert(d.is_finite()); + max_width = d; + } else { + max_width = 0; + } + + u8 cm_count = 0; + CharReach cm_cr; + if (cc.grey.allowCountingMiracles) { + findCountingMiracleInfo(leftfix, stop, &cm_count, &cm_cr); + } + + for (RoseVertex v : succs) { + bc.leftfix_info.emplace(v, left_build_info(qi, g[v].left.lag, max_width, + squash_mask, stop, + max_queuelen, cm_count, + cm_cr)); + } + + return true; +} + +static +unique_ptr constructTamaInfo(const RoseGraph &g, + const vector &subengines, + const bool is_suffix) { + unique_ptr tamaInfo = ue2::make_unique(); + for (const auto &sub : subengines) { + const auto &rose_vertices = sub.vertices; + NFA *nfa = sub.nfa.get(); + set tops; + for (const auto &v : rose_vertices) { + if (is_suffix) { + tops.insert(g[v].suffix.top); + } else { + for (const auto &e : in_edges_range(v, g)) { + tops.insert(g[e].rose_top); + } + } + } + tamaInfo->add(nfa, tops); + } + + return tamaInfo; +} + +static +void updateTops(const RoseGraph &g, const TamaInfo &tamaInfo, + TamaProto &tamaProto, + const vector &subengines, + const map, u32> &out_top_remap, + const bool is_suffix) { + u32 i = 0; + for (const auto &n : tamaInfo.subengines) { + for (const auto &v : subengines[i].vertices) { + if (is_suffix) { + tamaProto.add(n, g[v].idx, g[v].suffix.top, + out_top_remap); + } else { + for (const auto &e : in_edges_range(v, g)) { + tamaProto.add(n, g[v].idx, g[e].rose_top, + out_top_remap); + } + } + } + i++; + } +} + +static +shared_ptr constructContainerEngine(const RoseGraph &g, + build_context &bc, + const ExclusiveInfo &info, + const u32 queue, + const bool is_suffix) { + const auto &subengines = info.subengines; + auto tamaInfo = + constructTamaInfo(g, subengines, is_suffix); 
+ + map, u32> out_top_remap; + auto n = buildTamarama(*tamaInfo, queue, out_top_remap); + add_nfa_to_blob(bc, *n); + + DEBUG_PRINTF("queue id:%u\n", queue); + shared_ptr tamaProto = make_shared(); + tamaProto->reports = info.reports; + updateTops(g, *tamaInfo, *tamaProto, subengines, + out_top_remap, is_suffix); + return tamaProto; +} + +static +void buildInfixContainer(RoseGraph &g, build_context &bc, + const vector &exclusive_info) { + // Build tamarama engine + for (const auto &info : exclusive_info) { + const u32 queue = info.queue; + const auto &subengines = info.subengines; + auto tamaProto = + constructContainerEngine(g, bc, info, queue, false); + + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + for (const auto &v : verts) { + DEBUG_PRINTF("vert id:%lu\n", g[v].idx); + g[v].left.tamarama = tamaProto; + } + } + } +} + +static +void buildSuffixContainer(RoseGraph &g, build_context &bc, + const vector &exclusive_info) { + // Build tamarama engine + for (const auto &info : exclusive_info) { + const u32 queue = info.queue; + const auto &subengines = info.subengines; + auto tamaProto = + constructContainerEngine(g, bc, info, queue, true); + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + for (const auto &v : verts) { + DEBUG_PRINTF("vert id:%lu\n", g[v].idx); + g[v].suffix.tamarama = tamaProto; + } + const auto &v = verts[0]; + suffix_id newSuffix(g[v].suffix); + bc.suffixes.emplace(newSuffix, queue); + } + } +} + +static +void updateExclusiveInfixProperties(const RoseBuildImpl &build, + build_context &bc, + const vector &exclusive_info, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + for (const auto &info : exclusive_info) { + // Set leftfix optimisations, disabled for tamarama subengines + rose_group squash_mask = ~rose_group{0}; + // Leftfixes can have stop alphabets. + vector stop(N_CHARS, 0); + // Infix NFAs can have bounds on their queue lengths. 
+ u32 max_queuelen = 0; + u32 max_width = 0; + u8 cm_count = 0; + CharReach cm_cr; + + const auto &qi = info.queue; + const auto &subengines = info.subengines; + bool no_retrigger = true; + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + const auto &v_first = verts[0]; + left_id leftfix(g[v_first].left); + if (leftfix.haig() || !leftfix.graph() || + !nfaStuckOn(*leftfix.graph())) { + no_retrigger = false; + } + + for (const auto &v : verts) { + set lits; + for (auto u : inv_adjacent_vertices_range(v, build.g)) { + for (u32 lit_id : build.g[u].literals) { + lits.insert(build.literals.right.at(lit_id).s); + } + } + DEBUG_PRINTF("%zu literals\n", lits.size()); + + u32 queuelen = findMaxInfixMatches(leftfix, lits); + if (queuelen < UINT32_MAX) { + queuelen++; + } + max_queuelen = max(max_queuelen, queuelen); + } + } + + if (no_retrigger) { + no_retrigger_queues->insert(qi); + } + + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + for (const auto &v : verts) { + u32 lag = g[v].left.lag; + bc.leftfix_info.emplace( + v, left_build_info(qi, lag, max_width, squash_mask, stop, + max_queuelen, cm_count, cm_cr)); + } + } + } +} + +static +void updateExclusiveSuffixProperties(const RoseBuildImpl &build, + const vector &exclusive_info, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + for (auto &info : exclusive_info) { + const auto &qi = info.queue; + const auto &subengines = info.subengines; + bool no_retrigger = true; + for (const auto &sub : subengines) { + const auto &v_first = sub.vertices[0]; + suffix_id suffix(g[v_first].suffix); + if (!suffix.graph() || !nfaStuckOn(*suffix.graph())) { + no_retrigger = false; + break; + } + } + + if (no_retrigger) { + no_retrigger_queues->insert(qi); + } + } +} + +static +void buildExclusiveInfixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + const map> &infixTriggers, + const map> &vertex_map, + const vector> &groups, + set *no_retrigger_queues) { + RoseGraph &g = build.g; + const CompileContext &cc = build.cc; + + vector exclusive_info; + for (const auto &gp : groups) { + ExclusiveInfo info; + for (const auto &id : gp) { + const auto &verts = vertex_map.at(id); + left_id leftfix(g[verts[0]].left); + + bool is_transient = false; + auto n = makeLeftNfa(build, leftfix, false, is_transient, + infixTriggers, cc); + assert(n); + + setLeftNfaProperties(*n, leftfix); + + ExclusiveSubengine engine; + engine.nfa = move(n); + engine.vertices = verts; + info.subengines.push_back(move(engine)); + } + info.queue = qif.get_queue(); + exclusive_info.push_back(move(info)); + } + updateExclusiveInfixProperties(build, bc, exclusive_info, + no_retrigger_queues); + buildInfixContainer(g, bc, exclusive_info); +} + +static +void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + const map> &infixTriggers, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + + set> roleInfoSet; + map> vertex_map; + + u32 role_id = 0; + map leftfixes; for (auto v : vertices_range(g)) { - if (!g[v].left) { + if (!g[v].left || build.isRootSuccessor(v)) { continue; } + left_id leftfix(g[v].left); + + // Sanity check: our NFA should contain each of the tops mentioned on + // our in-edges. + assert(roseHasTops(g, v)); + + if (contains(leftfixes, leftfix)) { + // NFA already built. 
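
/*
 * Minimal sketch of the grouping step performed by findExclusiveInfixes()
 * here: vertices that share an identical leftfix are gathered under one role
 * id, so the exclusivity analysis operates on engines rather than on
 * individual vertices. Plain ints stand in for left_id and RoseVertex;
 * groupByEngine() is a hypothetical helper written only for this example.
 */
#include <cassert>
#include <map>
#include <utility>
#include <vector>

// Map each vertex's engine key to a role id and gather vertices per role.
static std::map<unsigned, std::vector<int>>
groupByEngine(const std::vector<std::pair<int, int>> &vertexEngine) {
    std::map<int, unsigned> roleOf;                 // engine -> role id
    std::map<unsigned, std::vector<int>> vertexMap; // role id -> vertices
    unsigned nextRole = 0;
    for (const auto &ve : vertexEngine) {
        int vertex = ve.first;
        int engine = ve.second;
        auto it = roleOf.find(engine);
        if (it == roleOf.end()) {
            it = roleOf.emplace(engine, nextRole++).first;
        }
        vertexMap[it->second].push_back(vertex);
    }
    return vertexMap;
}

int main() {
    // Vertices 10 and 12 share engine 7; vertex 11 has its own engine 9.
    auto groups = groupByEngine({{10, 7}, {11, 9}, {12, 7}});
    assert(groups.size() == 2);
    assert(groups.at(0) == (std::vector<int>{10, 12}));
    return 0;
}
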
+ u32 id = leftfixes[leftfix]; + if (contains(vertex_map, id)) { + vertex_map[id].push_back(v); + } + DEBUG_PRINTF("sharing leftfix, id=%u\n", id); + continue; + } + + if (leftfix.graph() || leftfix.castle()) { + leftfixes.emplace(leftfix, role_id); + vertex_map[role_id].push_back(v); + + map>> triggers; + findTriggerSequences(build, infixTriggers.at(leftfix), &triggers); + RoleInfo info(leftfix, role_id); + if (setTriggerLiteralsInfix(info, triggers)) { + roleInfoSet.insert(info); + } + role_id++; + } + } + + if (leftfixes.size() > 1) { + DEBUG_PRINTF("leftfix size:%lu\n", leftfixes.size()); + vector> groups; + exclusiveAnalysisInfix(build, vertex_map, roleInfoSet, groups); + buildExclusiveInfixes(build, bc, qif, infixTriggers, vertex_map, + groups, no_retrigger_queues); + } +} + +static +bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, + QueueIndexFactory &qif, set *no_retrigger_queues, + set *eager_queues, bool do_prefix) { + RoseGraph &g = tbi.g; + const CompileContext &cc = tbi.cc; + + map > infixTriggers; + vector order; + unordered_map > succs; + findInfixTriggers(tbi, &infixTriggers); + + if (cc.grey.allowTamarama && cc.streaming && !do_prefix) { + findExclusiveInfixes(tbi, bc, qif, infixTriggers, + no_retrigger_queues); + } + + for (auto v : vertices_range(g)) { + if (!g[v].left || g[v].left.tamarama) { + continue; + } + + assert(tbi.isNonRootSuccessor(v) != tbi.isRootSuccessor(v)); bool is_prefix = tbi.isRootSuccessor(v); if (do_prefix != is_prefix) { @@ -1132,11 +1787,13 @@ bool buildLeftfixes(const RoseBuildImpl &tbi, build_context &bc, // our in-edges. assert(roseHasTops(g, v)); - u32 qi; // queue index, set below. - u32 lag = g[v].left.lag; bool is_transient = contains(tbi.transient, leftfix); - if (is_transient && tbi.cc.grey.roseLookaroundMasks) { + // Transient leftfixes can sometimes be implemented solely with + // lookarounds, in which case we don't need to build an engine. + // TODO: Handle SOM-tracking cases as well. + if (cc.grey.roseLookaroundMasks && is_transient && + !g[v].left.tracksSom()) { vector lookaround; if (makeLeftfixLookaround(tbi, v, lookaround)) { DEBUG_PRINTF("implementing as lookaround!\n"); @@ -1145,94 +1802,42 @@ bool buildLeftfixes(const RoseBuildImpl &tbi, build_context &bc, } } - if (contains(seen, leftfix)) { - // NFA already built. - qi = seen[leftfix]; - assert(contains(bc.engineOffsets, qi)); - DEBUG_PRINTF("sharing leftfix, qi=%u\n", qi); - } else { - DEBUG_PRINTF("making %sleftfix\n", is_transient ? "transient " : ""); - - aligned_unique_ptr nfa; - - // Need to build NFA, which is either predestined to be a Haig (in - // SOM mode) or could be all manner of things. 
- if (leftfix.haig()) { - nfa = goughCompile(*leftfix.haig(), tbi.ssm.somPrecision(), cc, - rm); - } else { - assert(tbi.isNonRootSuccessor(v) != tbi.isRootSuccessor(v)); - nfa = makeLeftNfa(tbi, leftfix, is_prefix, is_transient, - infixTriggers, cc); - } - - if (!nfa) { - assert(!"failed to build leftfix"); - return false; - } - - setLeftNfaProperties(*nfa, leftfix); - - qi = qif.get_queue(); - nfa->queueIndex = qi; - - if (!is_prefix && !leftfix.haig() && leftfix.graph() && - nfaStuckOn(*leftfix.graph())) { - DEBUG_PRINTF("%u sticks on\n", qi); - no_retrigger_queues->insert(qi); - } - - DEBUG_PRINTF("built leftfix, qi=%u\n", qi); - add_nfa_to_blob(bc, *nfa); - seen.emplace(leftfix, qi); + if (!contains(succs, leftfix)) { + order.push_back(leftfix); } + succs[leftfix].push_back(v); + } + + rose_group initial_groups = tbi.getInitialGroups(); + rose_group combined_eager_squashed_mask = ~0ULL; + + map eager; + + for (const left_id &leftfix : order) { + const auto &left_succs = succs[leftfix]; + rose_group squash_mask = tbi.rose_squash_masks.at(leftfix); + eager_info ei; - // Leftfixes can have stop alphabets. - vector stop(N_CHARS, 0); - /* haigs track som information - need more care */ - som_type som = leftfix.haig() ? SOM_LEFT : SOM_NONE; - if (leftfix.graph()) { - stop = findLeftOffsetStopAlphabet(*leftfix.graph(), som); - } else if (leftfix.castle()) { - stop = findLeftOffsetStopAlphabet(*leftfix.castle(), som); + if (checkSuitableForEager(do_prefix, leftfix, tbi, left_succs, + squash_mask, initial_groups, ei, cc)) { + eager[leftfix] = ei; + combined_eager_squashed_mask &= squash_mask; + DEBUG_PRINTF("combo %016llx...\n", combined_eager_squashed_mask); } + } - // Infix NFAs can have bounds on their queue lengths. - u32 max_queuelen = UINT32_MAX; - if (!is_prefix) { - set lits; - for (auto u : inv_adjacent_vertices_range(v, tbi.g)) { - for (u32 lit_id : tbi.g[u].literals) { - lits.insert(tbi.literals.right.at(lit_id).s); - } - } - DEBUG_PRINTF("%zu literals\n", lits.size()); - max_queuelen = findMaxInfixMatches(leftfix, lits); - if (max_queuelen < UINT32_MAX) { - max_queuelen++; - } - } + if (do_prefix && combined_eager_squashed_mask & initial_groups) { + DEBUG_PRINTF("eager groups won't squash everyone - be lazy\n"); + eager_queues->clear(); + eager.clear(); + } - u32 max_width; - if (is_transient) { - depth d = findMaxWidth(leftfix); - assert(d.is_finite()); - max_width = d; - } else { - max_width = 0; - } - - u8 cm_count = 0; - CharReach cm_cr; - if (cc.grey.allowCountingMiracles) { - findCountingMiracleInfo(leftfix, stop, &cm_count, &cm_cr); - } - - bc.leftfix_info.emplace( - v, left_build_info(qi, lag, max_width, squash_mask, stop, - max_queuelen, cm_count, cm_cr)); + for (const left_id &leftfix : order) { + buildLeftfix(tbi, bc, do_prefix, qif.get_queue(), infixTriggers, + no_retrigger_queues, eager_queues, eager, succs[leftfix], + leftfix); } return true; @@ -1271,8 +1876,8 @@ public: }; aligned_unique_ptr operator()(unique_ptr &rdfa) const { - // Unleash the McClellan! - return mcclellanCompile(*rdfa, build.cc, build.rm); + // Unleash the mighty DFA! 
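
/*
 * Minimal sketch of the group-mask test applied to the candidate eager
 * prefixes above. Each eager prefix squashes some literal groups when it dies
 * (a cleared bit in its squash mask); the prefixes are only run eagerly if,
 * between them, they can squash every initially-enabled group, i.e. if the
 * AND of their squash masks leaves no initial-group bit set. Otherwise the
 * literal matcher could never be switched off and the prefixes stay lazy. The
 * mask values below are invented for illustration.
 */
#include <cassert>
#include <cstdint>

typedef uint64_t rose_group;

// Mirror of the check in buildLeftfixes(): if any initially-on group survives
// the combined squash mask, fall back to lazy prefixes.
static bool keepEagerPrefixes(rose_group combined_squashed,
                              rose_group initial_groups) {
    return (combined_squashed & initial_groups) == 0;
}

int main() {
    const rose_group initial = 0x3;        // groups 0 and 1 start enabled
    rose_group combined = ~rose_group{0};  // AND identity
    combined &= ~rose_group{1};            // prefix A squashes group 0
    assert(!keepEagerPrefixes(combined, initial)); // group 1 still on: lazy
    combined &= ~rose_group{2};            // prefix B squashes group 1
    assert(keepEagerPrefixes(combined, initial));  // all initial groups covered
    return 0;
}
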
+ return getDfa(*rdfa, build.cc, build.rm); } aligned_unique_ptr operator()(unique_ptr &haig) const { @@ -1300,7 +1905,7 @@ public: !has_bounded_repeats_other_than_firsts(*n)) { auto rdfa = buildMcClellan(h, &rm, cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); if (d) { n = pickImpl(move(d), move(n)); } @@ -1458,6 +2063,7 @@ void assignSuffixQueues(RoseBuildImpl &build, build_context &bc) { u32 queue = build.qif.get_queue(); DEBUG_PRINTF("assigning %p to queue %u\n", s.graph(), queue); bc.suffixes.emplace(s, queue); + build.suffix_queue_map.emplace(s, queue); } } @@ -1481,11 +2087,111 @@ void setSuffixProperties(NFA &n, const suffix_id &suff, } static -bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, - set *no_retrigger_queues) { - map > suffixTriggers; - findSuffixTriggers(tbi, &suffixTriggers); +void buildExclusiveSuffixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + map> &suffixTriggers, + const map> &vertex_map, + const vector> &groups, + set *no_retrigger_queues) { + RoseGraph &g = build.g; + vector exclusive_info; + for (const auto &gp : groups) { + ExclusiveInfo info; + for (const auto &id : gp) { + const auto &verts = vertex_map.at(id); + suffix_id s(g[verts[0]].suffix); + + const set &s_triggers = suffixTriggers.at(s); + + map fixed_depth_tops; + findFixedDepthTops(g, s_triggers, &fixed_depth_tops); + + map>> triggers; + findTriggerSequences(build, s_triggers, &triggers); + + auto n = buildSuffix(build.rm, build.ssm, fixed_depth_tops, + triggers, s, build.cc); + assert(n); + + setSuffixProperties(*n, s, build.rm); + + ExclusiveSubengine engine; + engine.nfa = move(n); + engine.vertices = verts; + info.subengines.push_back(move(engine)); + + const auto &reports = all_reports(s); + info.reports.insert(reports.begin(), reports.end()); + } + info.queue = qif.get_queue(); + exclusive_info.push_back(move(info)); + } + updateExclusiveSuffixProperties(build, exclusive_info, + no_retrigger_queues); + buildSuffixContainer(g, bc, exclusive_info); +} + +static +void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, + QueueIndexFactory &qif, + map> &suffixTriggers, + set *no_retrigger_queues) { + const RoseGraph &g = tbi.g; + + map suffixes; + set> roleInfoSet; + map> vertex_map; + u32 role_id = 0; + for (auto v : vertices_range(g)) { + if (!g[v].suffix) { + continue; + } + + const suffix_id s(g[v].suffix); + + DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].idx, s.graph()); + + // We may have already built this NFA. 
+ if (contains(suffixes, s)) { + u32 id = suffixes[s]; + if (!tbi.isInETable(v)) { + vertex_map[id].push_back(v); + } + continue; + } + + // Currently disable eod suffixes for exclusive analysis + if (!tbi.isInETable(v) && (s.graph() || s.castle())) { + DEBUG_PRINTF("assigning %p to id %u\n", s.graph(), role_id); + suffixes.emplace(s, role_id); + + vertex_map[role_id].push_back(v); + const set &s_triggers = suffixTriggers.at(s); + map>> triggers; + findTriggerSequences(tbi, s_triggers, &triggers); + + RoleInfo info(s, role_id); + if (setTriggerLiteralsSuffix(info, triggers)) { + roleInfoSet.insert(info); + } + role_id++; + } + } + + if (suffixes.size() > 1) { + DEBUG_PRINTF("suffix size:%lu\n", suffixes.size()); + vector> groups; + exclusiveAnalysisSuffix(tbi, vertex_map, roleInfoSet, groups); + buildExclusiveSuffixes(tbi, bc, qif, suffixTriggers, vertex_map, + groups, no_retrigger_queues); + } +} + +static +bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, + set *no_retrigger_queues, + const map> &suffixTriggers) { // To ensure compile determinism, build suffix engines in order of their // (unique) queue indices, so that we call add_nfa_to_blob in the same // order. @@ -1498,6 +2204,11 @@ bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, for (const auto &e : ordered) { const u32 queue = e.first; const suffix_id &s = e.second; + + if (s.tamarama()) { + continue; + } + const set &s_triggers = suffixTriggers.at(s); map fixed_depth_tops; @@ -1583,22 +2294,35 @@ void buildCountingMiracles(RoseBuildImpl &build, build_context &bc) { } } +/* Note: buildNfas may reduce the lag for vertices that have prefixes */ static bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, - set *no_retrigger_queues, u32 *leftfixBeginQueue) { + set *no_retrigger_queues, set *eager_queues, + u32 *leftfixBeginQueue) { + map> suffixTriggers; + findSuffixTriggers(tbi, &suffixTriggers); + + if (tbi.cc.grey.allowTamarama && tbi.cc.streaming) { + findExclusiveSuffixes(tbi, bc, qif, suffixTriggers, + no_retrigger_queues); + } + assignSuffixQueues(tbi, bc); - if (!buildSuffixes(tbi, bc, no_retrigger_queues)) { + if (!buildSuffixes(tbi, bc, no_retrigger_queues, suffixTriggers)) { return false; } + suffixTriggers.clear(); *leftfixBeginQueue = qif.allocated_count(); - if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, true)) { + if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues, + true)) { return false; } - if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, false)) { + if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues, + false)) { return false; } @@ -1642,10 +2366,10 @@ static void findTransientQueues(const map &leftfix_info, set *out) { DEBUG_PRINTF("curating transient queues\n"); - for (const auto &rbi : leftfix_info | map_values) { - if (rbi.transient) { - DEBUG_PRINTF("q %u is transient\n", rbi.queue); - out->insert(rbi.queue); + for (const auto &build : leftfix_info | map_values) { + if (build.transient) { + DEBUG_PRINTF("q %u is transient\n", build.queue); + out->insert(build.queue); } } } @@ -1750,24 +2474,18 @@ u32 addIteratorToTable(build_context &bc, return offset; } -static -bool hasLastByteHistoryOutEdge(const RoseGraph &g, RoseVertex v) { - for (const auto &e : out_edges_range(v, g)) { - if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) { - return true; - } - } - return false; -} - static u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { vector lb_roles; for (auto v : vertices_range(g)) { - if 
(hasLastByteHistoryOutEdge(g, v)) { - assert(contains(bc.roleStateIndices, v)); - lb_roles.push_back(bc.roleStateIndices.at(v)); + if (!hasLastByteHistorySucc(g, v)) { + continue; + } + // Eager EOD reporters won't have state indices. + auto it = bc.roleStateIndices.find(v); + if (it != end(bc.roleStateIndices)) { + lb_roles.push_back(it->second); } } @@ -1933,16 +2651,6 @@ bool anyEndfixMpvTriggers(const RoseBuildImpl &tbi) { return false; } -static -bool hasInternalReport(const set &reports, const ReportManager &rm) { - for (ReportID r : reports) { - if (!isExternalReport(rm.getReport(r))) { - return true; - } - } - return false; -} - static void populateNfaInfoBasics(const RoseBuildImpl &build, const build_context &bc, const vector &outfixes, @@ -1960,24 +2668,10 @@ void populateNfaInfoBasics(const RoseBuildImpl &build, const build_context &bc, info.no_retrigger = contains(no_retrigger_queues, qi) ? 1 : 0; } - // Mark outfixes that only trigger external reports. + // Mark outfixes that are in the small block matcher. for (const auto &out : outfixes) { const u32 qi = out.get_queue(); - infos[qi].in_sbmatcher = out.in_sbmatcher; - if (!hasInternalReport(all_reports(out), build.rm)) { - infos[qi].only_external = 1; - } - } - - // Mark suffixes that only trigger external reports. - for (const auto &e : bc.suffixes) { - const suffix_id &s = e.first; - u32 qi = e.second; - - if (!hasInternalReport(all_reports(s), build.rm)) { - infos[qi].only_external = 1; - } } // Mark suffixes triggered by EOD table literals. @@ -2150,6 +2844,12 @@ flattenProgram(const vector> &programs) { case ROSE_INSTR_CHECK_LOOKAROUND: ri.u.checkLookaround.fail_jump = jump_val; break; + case ROSE_INSTR_CHECK_MASK: + ri.u.checkMask.fail_jump = jump_val; + break; + case ROSE_INSTR_CHECK_BYTE: + ri.u.checkByte.fail_jump = jump_val; + break; case ROSE_INSTR_CHECK_INFIX: ri.u.checkInfix.fail_jump = jump_val; break; @@ -2263,6 +2963,18 @@ void recordResources(RoseResources &resources, break; } } + + const auto &g = build.g; + for (const auto &v : vertices_range(g)) { + if (g[v].eod_accept) { + resources.has_eod = true; + break; + } + if (g[v].suffix && has_eod_accepts(g[v].suffix)) { + resources.has_eod = true; + break; + } + } } static @@ -2328,7 +3040,37 @@ void buildActiveLeftIter(const vector &leftTable, } static -bool hasEodAnchors(const RoseBuildImpl &tbi, const build_context &bc, +bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { + const auto &g = build.g; + const auto v = target(e, g); + + if (!build.g[v].eod_accept) { + return false; + } + + // If there's a graph between us and EOD, we shouldn't be eager. + if (build.g[v].left) { + return false; + } + + // Must be exactly at EOD. + if (g[e].minBound != 0 || g[e].maxBound != 0) { + return false; + } + + // In streaming mode, we can only eagerly report EOD for literals in the + // EOD-anchored table, as that's the only time we actually know where EOD + // is. In block mode, we always have this information. 
+ const auto u = source(e, g); + if (build.cc.streaming && !build.isInETable(u)) { + return false; + } + + return true; +} + +static +bool hasEodAnchors(const RoseBuildImpl &build, const build_context &bc, u32 outfixEndQueue) { for (u32 i = 0; i < outfixEndQueue; i++) { if (nfaAcceptsEod(get_nfa_from_blob(bc, i))) { @@ -2337,16 +3079,18 @@ bool hasEodAnchors(const RoseBuildImpl &tbi, const build_context &bc, } } - if (tbi.eod_event_literal_id != MO_INVALID_IDX) { + if (build.eod_event_literal_id != MO_INVALID_IDX) { DEBUG_PRINTF("eod is an event to be celebrated\n"); return true; } - for (auto v : vertices_range(tbi.g)) { - if (tbi.g[v].eod_accept) { + + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].eod_accept) { DEBUG_PRINTF("literally report eod\n"); return true; } - if (tbi.g[v].suffix && has_eod_accepts(tbi.g[v].suffix)) { + if (g[v].suffix && has_eod_accepts(g[v].suffix)) { DEBUG_PRINTF("eod suffix\n"); return true; } @@ -2432,6 +3176,122 @@ bool onlyAtEod(const RoseBuildImpl &tbi, RoseVertex v) { return true; } +static +u32 addLookaround(build_context &bc, const vector &look) { + // Check the cache. + auto it = bc.lookaround_cache.find(look); + if (it != bc.lookaround_cache.end()) { + DEBUG_PRINTF("reusing look at idx %zu\n", it->second); + return verify_u32(it->second); + } + + // Linear scan for sequence. + auto seq_it = search(begin(bc.lookaround), end(bc.lookaround), begin(look), + end(look)); + if (seq_it != end(bc.lookaround)) { + size_t idx = distance(begin(bc.lookaround), seq_it); + DEBUG_PRINTF("linear scan found look at idx %zu\n", idx); + bc.lookaround_cache.emplace(look, idx); + return verify_u32(idx); + } + + // New sequence. + size_t idx = bc.lookaround.size(); + bc.lookaround_cache.emplace(look, idx); + insert(&bc.lookaround, bc.lookaround.end(), look); + DEBUG_PRINTF("adding look at idx %zu\n", idx); + return verify_u32(idx); +} + +static +bool checkReachMask(const CharReach &cr, u8 &andmask, u8 &cmpmask) { + size_t reach_size = cr.count(); + assert(reach_size > 0); + // check whether entry_size is some power of 2. 
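addLookaround above reuses lookaround data three ways: an exact-sequence cache, a linear std::search for an existing contiguous run, and only then an append. A self-contained sketch of the same idea over plain integers (LookTable and its fields are illustrative, not the real structures):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

// Toy stand-in for a lookaround sequence; the real entries pair an offset
// with a character reachability set.
using Look = std::vector<int>;

struct LookTable {
    std::vector<int> flat;        // all lookaround entries, concatenated
    std::map<Look, size_t> cache; // sequence -> start index in 'flat'

    size_t add(const Look &look) {
        // 1. Exact sequence already added: cache hit.
        auto it = cache.find(look);
        if (it != cache.end()) {
            return it->second;
        }
        // 2. The sequence may already exist as a contiguous run inside 'flat'.
        auto seq = std::search(flat.begin(), flat.end(), look.begin(), look.end());
        if (seq != flat.end()) {
            size_t idx = static_cast<size_t>(seq - flat.begin());
            cache.emplace(look, idx);
            return idx;
        }
        // 3. New sequence: append and remember where it starts.
        size_t idx = flat.size();
        cache.emplace(look, idx);
        flat.insert(flat.end(), look.begin(), look.end());
        return idx;
    }
};

int main() {
    LookTable t;
    std::cout << t.add({1, 2, 3}) << "\n"; // 0: appended
    std::cout << t.add({2, 3}) << "\n";    // 1: found by linear scan
    std::cout << t.add({1, 2, 3}) << "\n"; // 0: cache hit
}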
+ if ((reach_size - 1) & reach_size) { + return false; + } + make_and_cmp_mask(cr, &andmask, &cmpmask); + if ((1 << popcount32((u8)(~andmask))) ^ reach_size) { + return false; + } + return true; +} + +static +bool checkReachWithFlip(const CharReach &cr, u8 &andmask, + u8 &cmpmask, u8 &flip) { + if (checkReachMask(cr, andmask, cmpmask)) { + flip = 0; + return true; + } + if (checkReachMask(~cr, andmask, cmpmask)) { + flip = 1; + return true; + } + return false; +} + +static +bool makeRoleByte(const vector &look, + vector &program) { + if (look.size() == 1) { + const auto &entry = look[0]; + u8 andmask_u8, cmpmask_u8; + u8 flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { + return false; + } + s32 checkbyte_offset = verify_s32(entry.offset); + DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_BYTE, + JumpTarget::NEXT_BLOCK); + ri.u.checkByte.and_mask = andmask_u8; + ri.u.checkByte.cmp_mask = cmpmask_u8; + ri.u.checkByte.negation = flip; + ri.u.checkByte.offset = checkbyte_offset; + program.push_back(ri); + return true; + } + return false; +} + +static +bool makeRoleMask(const vector &look, + vector &program) { + if (look.back().offset < look.front().offset + 8) { + s32 base_offset = verify_s32(look.front().offset); + u64a and_mask = 0; + u64a cmp_mask = 0; + u64a neg_mask = 0; + for (const auto &entry : look) { + u8 andmask_u8, cmpmask_u8, flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, + cmpmask_u8, flip)) { + return false; + } + DEBUG_PRINTF("entry offset %d\n", entry.offset); + u32 shift = (entry.offset - base_offset) << 3; + and_mask |= (u64a)andmask_u8 << shift; + cmp_mask |= (u64a)cmpmask_u8 << shift; + if (flip) { + neg_mask |= 0xffLLU << shift; + } + } + DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n", + and_mask, cmp_mask); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_MASK, + JumpTarget::NEXT_BLOCK); + ri.u.checkMask.and_mask = and_mask; + ri.u.checkMask.cmp_mask = cmp_mask; + ri.u.checkMask.neg_mask = neg_mask; + ri.u.checkMask.offset = base_offset; + program.push_back(ri); + return true; + } + return false; +} + static void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, vector &program) { @@ -2457,19 +3317,16 @@ void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, return; } - DEBUG_PRINTF("role has lookaround\n"); - u32 look_idx; - auto it = bc.lookaround_cache.find(look); - if (it != bc.lookaround_cache.end()) { - DEBUG_PRINTF("reusing look at idx %zu\n", it->second); - look_idx = verify_u32(it->second); - } else { - size_t idx = bc.lookaround.size(); - bc.lookaround_cache.emplace(look, idx); - insert(&bc.lookaround, bc.lookaround.end(), look); - DEBUG_PRINTF("adding look at idx %zu\n", idx); - look_idx = verify_u32(idx); + if (makeRoleByte(look, program)) { + return; } + + if (makeRoleMask(look, program)) { + return; + } + + DEBUG_PRINTF("role has lookaround\n"); + u32 look_idx = addLookaround(bc, look); u32 look_count = verify_u32(look.size()); auto ri = RoseInstruction(ROSE_INSTR_CHECK_LOOKAROUND, @@ -2875,7 +3732,15 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, assert(contains(bc.engineOffsets, qi)); const NFA *nfa = get_nfa_from_blob(bc, qi); u32 suffixEvent; - if (isMultiTopType(nfa->type)) { + if (isContainerType(nfa->type)) { + auto tamaProto = g[v].suffix.tamarama.get(); + assert(tamaProto); + u32 top = (u32)MQE_TOP_FIRST + + tamaProto->top_remap.at(make_pair(g[v].idx, + g[v].suffix.top)); + 
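makeRoleByte/makeRoleMask above replace short lookarounds with single AND-and-compare checks: a byte class whose size is a power of two and whose members differ only in a few free bit positions can be tested as (c & and_mask) == cmp_mask, with a negation flag used when only the complement of the class is mask-friendly. A sketch of the mask derivation (this reimplements the idea for illustration; it is not the library's make_and_cmp_mask):

#include <cstdint>
#include <iostream>
#include <set>

// Derive masks such that (c & and_mask) == cmp_mask holds exactly for the
// bytes in 'reach', provided the class is mask-friendly: its size is a power
// of two and its members differ only in the bits cleared in and_mask.
static bool makeAndCmpMask(const std::set<uint8_t> &reach, uint8_t *and_mask,
                           uint8_t *cmp_mask) {
    size_t count = reach.size();
    if (count == 0 || (count & (count - 1))) {
        return false; // size must be a power of two
    }
    uint8_t varying = 0;
    uint8_t first = *reach.begin();
    for (uint8_t c : reach) {
        varying |= static_cast<uint8_t>(c ^ first); // bits that differ
    }
    // Each free (varying) bit doubles the set of accepted bytes; the mask
    // covers the class exactly only if 2^popcount(varying) == count.
    if ((size_t{1} << __builtin_popcount(varying)) != count) {
        return false;
    }
    *and_mask = static_cast<uint8_t>(~varying);
    *cmp_mask = static_cast<uint8_t>(first & *and_mask);
    return true;
}

int main() {
    std::set<uint8_t> az = {'a', 'A'}; // differ only in the 0x20 bit
    uint8_t am, cm;
    if (makeAndCmpMask(az, &am, &cm)) {
        // 'a' and 'A' both satisfy (c & 0xdf) == 0x41; 'b' does not.
        std::cout << std::hex << "and=0x" << +am << " cmp=0x" << +cm << "\n";
    }
}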
assert(top < MQE_INVALID); + suffixEvent = top; + } else if (isMultiTopType(nfa->type)) { assert(!g[v].suffix.haig); u32 top = (u32)MQE_TOP_FIRST + g[v].suffix.top; assert(top < MQE_INVALID); @@ -2893,11 +3758,38 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, } static -void makeRoleGroups(const rose_group &groups, +void makeRoleGroups(RoseBuildImpl &build, build_context &bc, RoseVertex v, vector &program) { + const auto &g = build.g; + rose_group groups = g[v].groups; if (!groups) { return; } + + // The set of "already on" groups as we process this vertex is the + // intersection of the groups set by our predecessors. + assert(in_degree(v, g) > 0); + rose_group already_on = ~rose_group{0}; + for (const auto &u : inv_adjacent_vertices_range(v, g)) { + already_on &= bc.vertex_group_map.at(u); + } + + DEBUG_PRINTF("already_on=0x%llx\n", already_on); + DEBUG_PRINTF("squashable=0x%llx\n", bc.squashable_groups); + DEBUG_PRINTF("groups=0x%llx\n", groups); + + already_on &= ~bc.squashable_groups; + DEBUG_PRINTF("squashed already_on=0x%llx\n", already_on); + + // We don't *have* to mask off the groups that we know are already on, but + // this will make bugs more apparent. + groups &= ~already_on; + + if (!groups) { + DEBUG_PRINTF("no new groups to set, skipping\n"); + return; + } + auto ri = RoseInstruction(ROSE_INSTR_SET_GROUPS); ri.u.setGroups.groups = groups; program.push_back(ri); @@ -2926,7 +3818,13 @@ void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, // DFAs have no TOP_N support, so they get a classic MQE_TOP event. u32 top; - if (!isMultiTopType(nfa->type)) { + if (isContainerType(nfa->type)) { + auto tamaProto = g[v].left.tamarama.get(); + assert(tamaProto); + top = MQE_TOP_FIRST + tamaProto->top_remap.at( + make_pair(g[v].idx, g[e].rose_top)); + assert(top < MQE_INVALID); + } else if (!isMultiTopType(nfa->type)) { assert(num_tops(g[v].left) == 1); top = MQE_TOP; } else { @@ -2989,7 +3887,7 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v, : g[e].maxBound + lit_length; if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { - assert(g[u].max_offset != ROSE_BOUND_INF); + assert(g[u].fixedOffset()); // Make offsets absolute. min_bound += g[u].max_offset; if (max_bound != ROSE_BOUND_INF) { @@ -3006,6 +3904,10 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v, max_bound = MAX_OFFSET; } + // This instruction should be doing _something_ -- bounds should be tighter + // than just {length, inf}. + assert(min_bound > lit_length || max_bound < MAX_OFFSET); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_BOUNDS, JumpTarget::NEXT_BLOCK); ri.u.checkBounds.min_bound = min_bound; ri.u.checkBounds.max_bound = max_bound; @@ -3032,6 +3934,30 @@ void makeRoleCheckNotHandled(build_context &bc, RoseVertex v, program.push_back(move(ri)); } +static +void makeRoleEagerEodReports(RoseBuildImpl &build, build_context &bc, + RoseVertex v, vector &program) { + vector eod_program; + + for (const auto &e : out_edges_range(v, build.g)) { + if (canEagerlyReportAtEod(build, e)) { + makeRoleReports(build, bc, target(e, build.g), eod_program); + } + } + + if (eod_program.empty()) { + return; + } + + if (!onlyAtEod(build, v)) { + // The rest of our program wasn't EOD anchored, so we need to guard + // these reports with a check. 
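makeRoleGroups above only emits a SET_GROUPS for groups that are not provably on already: the guaranteed-on set is the intersection of the predecessors' groups, minus anything squashable. A toy version of that mask arithmetic (the group numbering and values are invented for the example):

#include <cstdint>
#include <cstdio>
#include <vector>

using rose_group = uint64_t;

// Returns the groups this role still needs to set: its own groups minus those
// guaranteed to be on already (the intersection over all predecessors),
// ignoring groups that may have been squashed back off at runtime.
static rose_group groupsToSet(rose_group own,
                              const std::vector<rose_group> &pred_groups,
                              rose_group squashable) {
    rose_group already_on = ~rose_group{0};
    for (rose_group g : pred_groups) {
        already_on &= g;
    }
    already_on &= ~squashable; // squashed groups cannot be assumed on
    return own & ~already_on;
}

int main() {
    // Both preds set group 0; only one sets group 1; group 2 is squashable.
    std::vector<rose_group> preds = {0x7, 0x5};
    rose_group need = groupsToSet(/*own=*/0x7, preds, /*squashable=*/0x4);
    std::printf("groups to set: 0x%llx\n", (unsigned long long)need);
    // prints 0x6: group 0 is provably on, groups 1 and 2 must still be set.
}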
+ program.emplace_back(ROSE_INSTR_CHECK_ONLY_EOD, JumpTarget::NEXT_BLOCK); + } + + program.insert(end(program), begin(eod_program), end(eod_program)); +} + static vector makeProgram(RoseBuildImpl &build, build_context &bc, const RoseEdge &e) { @@ -3068,15 +3994,21 @@ vector makeProgram(RoseBuildImpl &build, build_context &bc, // Next, we can add program instructions that have effects. makeRoleReports(build, bc, v, program); + makeRoleInfixTriggers(build, bc, v, program); // Note: SET_GROUPS instruction must be after infix triggers, as an infix // going dead may switch off groups. - makeRoleGroups(g[v].groups, program); + makeRoleGroups(build, bc, v, program); makeRoleSuffix(build, bc, v, program); + makeRoleSetState(bc, v, program); + // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if + // the program doesn't have one already). + makeRoleEagerEodReports(build, bc, v, program); + return program; } @@ -3135,10 +4067,21 @@ void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { if (build.isVirtualVertex(v)) { continue; } - // Leaf nodes don't need state indices, as they don't have successors. - if (isLeafNode(v, g)) { + + // We only need a state index if we have successors that are not + // eagerly-reported EOD vertices. + bool needs_state_index = false; + for (const auto &e : out_edges_range(v, g)) { + if (!canEagerlyReportAtEod(build, e)) { + needs_state_index = true; + break; + } + } + + if (!needs_state_index) { continue; } + /* TODO: also don't need a state index if all edges are nfa based */ bc.roleStateIndices.emplace(v, state++); } @@ -3149,9 +4092,9 @@ void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { } static -bool hasUsefulStops(const left_build_info &rbi) { +bool hasUsefulStops(const left_build_info &build) { for (u32 i = 0; i < N_CHARS; i++) { - if (rbi.stopAlphabet[i]) { + if (build.stopAlphabet[i]) { return true; } } @@ -3160,6 +4103,7 @@ bool hasUsefulStops(const left_build_info &rbi) { static void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, + const set &eager_queues, u32 leftfixBeginQueue, u32 leftfixCount, vector &leftTable, u32 *laggedRoseCount, size_t *history) { @@ -3219,6 +4163,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, DEBUG_PRINTF("mw = %u\n", lbi.transient); left.transient = verify_u8(lbi.transient); left.infix = tbi.isNonRootSuccessor(v); + left.eager = contains(eager_queues, lbi.queue); // A rose has a lagIndex if it's non-transient and we are // streaming. @@ -3249,7 +4194,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, } static -u32 addPredBlocksSingle( +void addPredBlocksSingle( map>> &predProgramLists, vector &program) { @@ -3257,6 +4202,7 @@ u32 addPredBlocksSingle( for (const auto &m : predProgramLists) { const u32 &pred_state = m.first; + assert(!m.second.empty()); auto subprog = flattenProgram(m.second); // Check our pred state. @@ -3271,7 +4217,6 @@ u32 addPredBlocksSingle( auto prog = flattenProgram(prog_blocks); program.insert(end(program), begin(prog), end(prog)); - return 0; // No iterator. 
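makeRoleEagerEodReports above gathers report instructions for successors that can only fire exactly at EOD and, when the rest of the role program is not already EOD-anchored, guards them with a CHECK_ONLY_EOD. A toy assembly of that pattern over string "instructions" (the helper itself is illustrative, not the real program builder):

#include <iostream>
#include <string>
#include <vector>

using Program = std::vector<std::string>;

// Append eagerly-reported EOD work to an existing role program, guarding it
// with CHECK_ONLY_EOD unless the program already runs only at EOD.
static void appendEagerEodReports(Program &program, const Program &eod_reports,
                                  bool only_at_eod) {
    if (eod_reports.empty()) {
        return; // nothing fires exactly at EOD from this role
    }
    if (!only_at_eod) {
        program.push_back("CHECK_ONLY_EOD"); // skip the reports mid-stream
    }
    program.insert(program.end(), eod_reports.begin(), eod_reports.end());
}

int main() {
    Program prog = {"CHECK_BOUNDS", "REPORT"};
    appendEagerEodReports(prog, {"REPORT_EOD"}, /*only_at_eod=*/false);
    for (const auto &i : prog) {
        std::cout << i << "\n";
    }
}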
} static @@ -3284,7 +4229,7 @@ u32 programLength(const vector &program) { } static -u32 addPredBlocksMulti(build_context &bc, +void addPredBlocksMulti(build_context &bc, map>> &predProgramLists, vector &program) { assert(!predProgramLists.empty()); @@ -3314,6 +4259,7 @@ u32 addPredBlocksMulti(build_context &bc, DEBUG_PRINTF("subprogram %zu has offset %u\n", jump_table.size(), curr_offset); jump_table.push_back(curr_offset); + assert(!e.second.empty()); auto subprog = flattenProgram(e.second); if (e.first != keys.back()) { @@ -3359,31 +4305,31 @@ u32 addPredBlocksMulti(build_context &bc, } program.insert(end(program), begin(sparse_program), end(sparse_program)); - - return iter_offset; } static -u32 addPredBlocks(build_context &bc, - map>> &predProgramLists, - vector &program, - bool force_sparse_iter) { +void addPredBlocks(build_context &bc, + map>> &predProgramLists, + vector &program) { const size_t num_preds = predProgramLists.size(); if (num_preds == 0) { program = flattenProgram({program}); - return 0; // No iterator. - } else if (!force_sparse_iter && num_preds == 1) { - return addPredBlocksSingle(predProgramLists, program); - } else { - return addPredBlocksMulti(bc, predProgramLists, program); + return; } + + if (num_preds == 1) { + addPredBlocksSingle(predProgramLists, program); + return; + } + + addPredBlocksMulti(bc, predProgramLists, program); } /** * Returns the pair (program offset, sparse iter offset). */ static -pair makeSparseIterProgram(build_context &bc, +vector makeSparseIterProgram(build_context &bc, map>> &predProgramLists, const vector &root_program, const vector &pre_program) { @@ -3399,7 +4345,7 @@ pair makeSparseIterProgram(build_context &bc, // Add blocks to deal with non-root edges (triggered by sparse iterator or // mmbit_isset checks). This operation will flatten the program up to this // point. - u32 iter_offset = addPredBlocks(bc, predProgramLists, program, false); + addPredBlocks(bc, predProgramLists, program); // If we have a root program, replace the END instruction with it. Note // that the root program has already been flattened. 
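addPredBlocks above picks the cheapest gating for per-predecessor sub-programs: nothing when there are none, a single state check for one predecessor, and a sparse-iterator jump table for several. A schematic sketch (CHECK_STATE and SPARSE_ITER here are simplified stand-ins for the real Rose instructions and mmbit iterator):

#include <iostream>
#include <map>
#include <string>
#include <vector>

using Instr = std::string;
using Block = std::vector<Instr>;

// Gate each predecessor's block on its state bit. With one predecessor a
// plain state check suffices; with several, a sparse iterator walks only the
// states that are actually on and jumps to the matching block.
static Block gateOnPredStates(const std::map<unsigned, Block> &pred_blocks) {
    Block out;
    if (pred_blocks.empty()) {
        return out;
    }
    if (pred_blocks.size() == 1) {
        const auto &only = *pred_blocks.begin();
        out.push_back("CHECK_STATE " + std::to_string(only.first));
        out.insert(out.end(), only.second.begin(), only.second.end());
        return out;
    }
    out.push_back("SPARSE_ITER_BEGIN"); // jump table over live pred states
    for (const auto &e : pred_blocks) {
        out.push_back("CASE state=" + std::to_string(e.first));
        out.insert(out.end(), e.second.begin(), e.second.end());
    }
    out.push_back("SPARSE_ITER_END");
    return out;
}

int main() {
    std::map<unsigned, Block> blocks = {{3, {"REPORT A"}}, {7, {"REPORT B"}}};
    for (const auto &i : gateOnPredStates(blocks)) {
        std::cout << i << "\n";
    }
}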
@@ -3410,8 +4356,7 @@ pair makeSparseIterProgram(build_context &bc, program.insert(end(program), begin(root_program), end(root_program)); } - applyFinalSpecialisation(program); - return {writeProgram(bc, program), iter_offset}; + return program; } static @@ -3439,8 +4384,7 @@ void makePushDelayedInstructions(const RoseBuildImpl &build, u32 final_id, } static -void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, - vector &program) { +rose_group getFinalIdGroupsUnion(const RoseBuildImpl &build, u32 final_id) { assert(contains(build.final_id_to_literal, final_id)); const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); @@ -3448,7 +4392,13 @@ void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, for (const auto &li : lit_infos) { groups |= li->group_mask; } + return groups; +} +static +void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, + vector &program) { + rose_group groups = getFinalIdGroupsUnion(build, final_id); if (!groups) { return; } @@ -3497,11 +4447,7 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, return; } - rose_group groups = 0; - for (const auto &li : lit_infos) { - groups |= li->group_mask; - } - + rose_group groups = getFinalIdGroupsUnion(build, final_id); if (!groups) { return; } @@ -3513,6 +4459,62 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, program.push_back(move(ri)); } +static +u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 max_offset = 0; + for (const auto &v : lit_vertices) { + max_offset = max(max_offset, build.g[v].max_offset); + } + + return max_offset; +} + +static +void makeRecordAnchoredInstruction(const RoseBuildImpl &build, + build_context &bc, u32 final_id, + vector &program) { + assert(contains(build.final_id_to_literal, final_id)); + const auto &lit_ids = build.final_id_to_literal.at(final_id); + + // Must be anchored. + assert(!lit_ids.empty()); + if (build.literals.right.at(*begin(lit_ids)).table != ROSE_ANCHORED) { + return; + } + + // If this anchored literal can never match past + // floatingMinLiteralMatchOffset, we will never have to record it. 
+ u32 max_offset = 0; + for (u32 lit_id : lit_ids) { + assert(build.literals.right.at(lit_id).table == ROSE_ANCHORED); + max_offset = max(max_offset, findMaxOffset(build, lit_id)); + } + + if (max_offset <= bc.floatingMinLiteralMatchOffset) { + return; + } + + auto ri = RoseInstruction(ROSE_INSTR_RECORD_ANCHORED); + ri.u.recordAnchored.id = final_id; + program.push_back(move(ri)); +} + +static +u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 min_offset = UINT32_MAX; + for (const auto &v : lit_vertices) { + min_offset = min(min_offset, build.g[v].min_offset); + } + + return min_offset; +} + static void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc, u32 final_id, @@ -3536,22 +4538,36 @@ void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc, return; } - size_t min_offset = SIZE_MAX; + size_t min_len = SIZE_MAX; + u32 min_offset = UINT32_MAX; for (u32 lit_id : lit_ids) { const auto &lit = build.literals.right.at(lit_id); - min_offset = min(min_offset, lit.elength()); + size_t lit_min_len = lit.elength(); + u32 lit_min_offset = findMinOffset(build, lit_id); + DEBUG_PRINTF("lit_id=%u has min_len=%zu, min_offset=%u\n", lit_id, + lit_min_len, lit_min_offset); + min_len = min(min_len, lit_min_len); + min_offset = min(min_offset, lit_min_offset); } - DEBUG_PRINTF("%zu lits, min_offset=%zu\n", lit_ids.size(), min_offset); + DEBUG_PRINTF("final_id=%u has min_len=%zu, min_offset=%u, " + "global min is %u\n", final_id, min_len, min_offset, + bc.floatingMinLiteralMatchOffset); // If we can't match before the min offset, we don't need the check. - if (min_offset >= bc.floatingMinLiteralMatchOffset) { + if (min_len >= bc.floatingMinLiteralMatchOffset) { DEBUG_PRINTF("no need for check, min is %u\n", - bc.floatingMinLiteralMatchOffset); + bc.floatingMinLiteralMatchOffset); return; } - program.push_back(RoseInstruction(ROSE_INSTR_CHECK_LIT_EARLY)); + assert(min_offset >= bc.floatingMinLiteralMatchOffset); + assert(min_offset < UINT32_MAX); + + DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_LIT_EARLY); + ri.u.checkLitEarly.min_offset = min_offset; + program.push_back(move(ri)); } static @@ -3601,8 +4617,9 @@ vector buildLitInitialProgram(RoseBuildImpl &build, } static -u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, - const vector &lit_edges) { +vector buildLiteralProgram(RoseBuildImpl &build, + build_context &bc, u32 final_id, + const vector &lit_edges) { const auto &g = build.g; DEBUG_PRINTF("final id %u, %zu lit edges\n", final_id, lit_edges.size()); @@ -3621,6 +4638,9 @@ u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, assert(contains(bc.roleStateIndices, u)); u32 pred_state = bc.roleStateIndices.at(u); auto program = makeProgram(build, bc, e); + if (program.empty()) { + continue; + } predProgramLists[pred_state].push_back(program); } @@ -3639,10 +4659,18 @@ u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, root_programs.push_back(role_prog); } - // Literal may squash groups. if (final_id != MO_INVALID_IDX) { - root_programs.push_back({}); - makeGroupSquashInstruction(build, final_id, root_programs.back()); + vector prog; + + // Literal may squash groups. + makeGroupSquashInstruction(build, final_id, prog); + + // Literal may be anchored and need to be recorded. 
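Both helpers above compare a literal's possible match offsets with floatingMinLiteralMatchOffset: an anchored literal only needs a RECORD_ANCHORED if it can match beyond that offset, and a CHECK_LIT_EARLY is only emitted when a literal can match before it. A small arithmetic sketch (the offset values are invented):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// An anchored literal must be recorded only if some vertex using it can match
// past the floating matcher's minimum match offset.
static bool needsRecordAnchored(const std::vector<uint32_t> &vertex_max_offsets,
                                uint32_t floating_min_lit_match_offset) {
    uint32_t max_offset = 0;
    for (uint32_t o : vertex_max_offsets) {
        max_offset = std::max(max_offset, o);
    }
    return max_offset > floating_min_lit_match_offset;
}

// A floating literal needs an early-match check only if its shortest possible
// match end lies before the global floating minimum.
static bool needsCheckLitEarly(size_t min_lit_len,
                               uint32_t floating_min_lit_match_offset) {
    return min_lit_len < floating_min_lit_match_offset;
}

int main() {
    std::printf("%d %d\n",
                needsRecordAnchored({4, 9}, 6), // 1: can match at offset 9 > 6
                needsCheckLitEarly(8, 6));      // 0: cannot match before offset 6
}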
+ makeRecordAnchoredInstruction(build, bc, final_id, prog); + + if (!prog.empty()) { + root_programs.push_back(move(prog)); + } } vector root_program; @@ -3654,7 +4682,19 @@ u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, // Put it all together. return makeSparseIterProgram(bc, predProgramLists, root_program, - pre_program).first; + pre_program); +} + +static +u32 writeLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, + const vector &lit_edges) { + auto program = buildLiteralProgram(build, bc, final_id, lit_edges); + if (program.empty()) { + return 0; + } + // Note: already flattened. + applyFinalSpecialisation(program); + return writeProgram(bc, program); } static @@ -3720,53 +4760,84 @@ pair buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) { const u32 num_literals = build.final_id_to_literal.size(); auto lit_edge_map = findEdgesByLiteral(build); - vector litPrograms(num_literals); + bc.litPrograms.resize(num_literals); vector delayRebuildPrograms(num_literals); for (u32 finalId = 0; finalId != num_literals; ++finalId) { const auto &lit_edges = lit_edge_map[finalId]; - litPrograms[finalId] = - buildLiteralProgram(build, bc, finalId, lit_edges); + bc.litPrograms[finalId] = + writeLiteralProgram(build, bc, finalId, lit_edges); delayRebuildPrograms[finalId] = buildDelayRebuildProgram(build, bc, finalId); } u32 litProgramsOffset = - add_to_engine_blob(bc, begin(litPrograms), end(litPrograms)); + add_to_engine_blob(bc, begin(bc.litPrograms), end(bc.litPrograms)); u32 delayRebuildProgramsOffset = add_to_engine_blob( bc, begin(delayRebuildPrograms), end(delayRebuildPrograms)); return {litProgramsOffset, delayRebuildProgramsOffset}; } +/** + * \brief Returns all reports used by output-exposed engines, for which we need + * to generate programs. + */ static -u32 buildReportPrograms(RoseBuildImpl &build, build_context &bc) { - const auto &rm = build.rm; - const u32 numReports = verify_u32(rm.numReports()); - vector programs(numReports); +set findEngineReports(const RoseBuildImpl &build) { + set reports; + + // The small write engine uses these engine report programs. 
+ insert(&reports, build.smwr.all_reports()); + + for (const auto &outfix : build.outfixes) { + insert(&reports, all_reports(outfix)); + } + + const auto &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].suffix) { + insert(&reports, all_reports(g[v].suffix)); + } + } + + DEBUG_PRINTF("%zu engine reports (of %zu)\n", reports.size(), + build.rm.numReports()); + return reports; +} + +static +pair buildReportPrograms(RoseBuildImpl &build, build_context &bc) { + const auto reports = findEngineReports(build); + vector programs; + programs.reserve(reports.size()); vector program; - for (ReportID id = 0; id < numReports; id++) { + for (ReportID id : reports) { program.clear(); const bool has_som = false; makeCatchupMpv(build, bc, id, program); makeReport(build, id, has_som, program); program = flattenProgram({program}); applyFinalSpecialisation(program); - programs[id] = writeProgram(bc, program); - build.rm.setProgramOffset(id, programs[id]); + u32 offset = writeProgram(bc, program); + programs.push_back(offset); + build.rm.setProgramOffset(id, offset); DEBUG_PRINTF("program for report %u @ %u (%zu instructions)\n", id, programs.back(), program.size()); } - return add_to_engine_blob(bc, begin(programs), end(programs)); + u32 offset = add_to_engine_blob(bc, begin(programs), end(programs)); + u32 count = verify_u32(programs.size()); + return {offset, count}; } static vector makeEodAnchorProgram(RoseBuildImpl &build, build_context &bc, - const RoseEdge &e) { + const RoseEdge &e, + const bool multiple_preds) { const RoseGraph &g = build.g; const RoseVertex v = target(e, g); @@ -3776,7 +4847,7 @@ vector makeEodAnchorProgram(RoseBuildImpl &build, makeRoleCheckBounds(build, v, e, program); } - if (hasGreaterInDegree(1, v, g)) { + if (multiple_preds) { // Only necessary when there is more than one pred. makeRoleCheckNotHandled(bc, v, program); } @@ -3792,11 +4863,34 @@ vector makeEodAnchorProgram(RoseBuildImpl &build, return program; } -/** - * Returns the pair (program offset, sparse iter offset). - */ static -pair buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { +bool hasEodAnchoredSuffix(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].suffix && build.isInETable(v)) { + DEBUG_PRINTF("vertex %zu is in eod table and has a suffix\n", + g[v].idx); + return true; + } + } + return false; +} + +static +bool hasEodMatcher(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (build.isInETable(v)) { + DEBUG_PRINTF("vertex %zu is in eod table\n", g[v].idx); + return true; + } + } + return false; +} + +static +void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, + bool in_etable, vector &program) { const RoseGraph &g = build.g; // pred state id -> list of programs @@ -3810,37 +4904,50 @@ pair buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx, in_degree(v, g)); + vector edge_list; for (const auto &e : in_edges_range(v, g)) { RoseVertex u = source(e, g); + if (build.isInETable(u) != in_etable) { + DEBUG_PRINTF("pred %zu %s in etable\n", g[u].idx, + in_etable ? 
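The EOD-time program is assembled from optional blocks (engine catch-up, the EOD-anchored literal matcher, suffix handling, plus the anchor programs built here); as the following hunk shows, each appender pops the trailing END sentinel before adding its own block and re-terminating, so the result stays one well-formed program, and a program containing only END is treated as empty. A toy version of that append pattern:

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

using Program = std::vector<std::string>;

// Append a block to a program that may already be END-terminated: drop the
// old END, add the block, and re-terminate. Empty blocks are skipped.
static void appendBlock(Program &program, const Program &block) {
    if (block.empty()) {
        return;
    }
    if (!program.empty()) {
        assert(program.back() == "END");
        program.pop_back();
    }
    program.insert(program.end(), block.begin(), block.end());
    program.push_back("END");
}

int main() {
    Program eod;
    appendBlock(eod, {});              // no EOD event literals
    appendBlock(eod, {"ENGINES_EOD"}); // outfixes/suffixes to catch up
    appendBlock(eod, {"MATCHER_EOD"}); // EOD-anchored literal table
    appendBlock(eod, {"SUFFIXES_EOD"});
    for (const auto &i : eod) {
        std::cout << i << "\n";
    }
}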
"is not" : "is"); + continue; + } + if (canEagerlyReportAtEod(build, e)) { + DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx); + continue; + } + edge_list.push_back(e); + } + const bool multiple_preds = edge_list.size() > 1; + for (const auto &e : edge_list) { + RoseVertex u = source(e, g); assert(contains(bc.roleStateIndices, u)); u32 predStateIdx = bc.roleStateIndices.at(u); - auto program = makeEodAnchorProgram(build, bc, e); - predProgramLists[predStateIdx].push_back(program); + auto prog = makeEodAnchorProgram(build, bc, e, multiple_preds); + if (prog.empty()) { + continue; + } + predProgramLists[predStateIdx].push_back(prog); } } if (predProgramLists.empty()) { - DEBUG_PRINTF("no eod anchored roles\n"); - return {0, 0}; + return; } - - vector program; - - // Note: we force the use of a sparse iterator for the EOD program so we - // can easily guard EOD execution at runtime. - u32 iter_offset = addPredBlocks(bc, predProgramLists, program, true); - - assert(program.size() > 1); - applyFinalSpecialisation(program); - return {writeProgram(bc, program), iter_offset}; + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + addPredBlocks(bc, predProgramLists, program); } static -u32 writeEodProgram(RoseBuildImpl &build, build_context &bc) { +void addEodEventProgram(RoseBuildImpl &build, build_context &bc, + vector &program) { if (build.eod_event_literal_id == MO_INVALID_IDX) { - return 0; + return; } const RoseGraph &g = build.g; @@ -3864,7 +4971,80 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc) { tie(g[source(b, g)].idx, g[target(b, g)].idx); }); - return buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); + auto prog = buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); + program.insert(end(program), begin(prog), end(prog)); +} + +static +void addEnginesEodProgram(u32 eodNfaIterOffset, + vector &program) { + if (!eodNfaIterOffset) { + return; + } + + auto ri = RoseInstruction(ROSE_INSTR_ENGINES_EOD); + ri.u.enginesEod.iter_offset = eodNfaIterOffset; + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.push_back(move(ri)); + program.emplace_back(ROSE_INSTR_END); +} + +static +void addSuffixesEodProgram(const RoseBuildImpl &build, + vector &program) { + if (!hasEodAnchoredSuffix(build)) { + return; + } + + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.emplace_back(ROSE_INSTR_SUFFIXES_EOD); + program.emplace_back(ROSE_INSTR_END); +} + +static +void addMatcherEodProgram(const RoseBuildImpl &build, + vector &program) { + if (!hasEodMatcher(build)) { + return; + } + + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.emplace_back(ROSE_INSTR_MATCHER_EOD); + program.emplace_back(ROSE_INSTR_END); +} + +static +u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, + u32 eodNfaIterOffset) { + vector program; + + addEodEventProgram(build, bc, program); + addEnginesEodProgram(eodNfaIterOffset, program); + addEodAnchorProgram(build, bc, false, program); + addMatcherEodProgram(build, program); + addEodAnchorProgram(build, bc, true, program); + addSuffixesEodProgram(build, program); + + if (program.size() == 1) { + assert(program.back().code() == ROSE_INSTR_END); + return 0; + } + + if (program.empty()) { + return 0; + } + + applyFinalSpecialisation(program); + return writeProgram(bc, program); } static @@ -3900,6 +5080,9 
@@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { u32 max_d = g[v].max_offset; u32 min_d = g[v].min_offset; + DEBUG_PRINTF("checking %u: elen %zu min/max %u/%u\n", lit_id, + key.elength_including_mask(), min_d, max_d); + if (build.literal_info[lit_id].undelayed_id != lit_id) { /* this is a delayed match; need to update delay properties */ /* TODO: can delayed literals ever be in another table ? */ @@ -3919,9 +5102,9 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { switch (key.table) { case ROSE_FLOATING: ENSURE_AT_LEAST(&engine->floatingDistance, max_d); - if (min_d >= key.elength()) { + if (min_d >= key.elength_including_mask()) { LIMIT_TO_AT_MOST(&engine->floatingMinDistance, - min_d - (u32)key.elength()); + min_d - (u32)key.elength_including_mask()); } else { /* overlapped literals from rose + anchored table can * cause us to underflow due to sloppiness in @@ -3965,6 +5148,60 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { } } +static +u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, + u32 queue_count, + build_context &bc) { + if (eager.empty()) { + return 0; + } + + vector vec; + for (u32 q : eager) { + assert(q >= leftfixBeginQueue); + vec.push_back(q - leftfixBeginQueue); + } + + vector iter; + mmbBuildSparseIterator(iter, vec, queue_count - leftfixBeginQueue); + return addIteratorToTable(bc, iter); +} + +static +aligned_unique_ptr addSmallWriteEngine(RoseBuildImpl &build, + aligned_unique_ptr rose) { + assert(rose); + + if (roseIsPureLiteral(rose.get())) { + DEBUG_PRINTF("pure literal case, not adding smwr\n"); + return rose; + } + + u32 qual = roseQuality(rose.get()); + auto smwr_engine = build.smwr.build(qual); + if (!smwr_engine) { + DEBUG_PRINTF("no smwr built\n"); + return rose; + } + + const size_t mainSize = roseSize(rose.get()); + const size_t smallWriteSize = smwrSize(smwr_engine.get()); + DEBUG_PRINTF("adding smwr engine, size=%zu\n", smallWriteSize); + + const size_t smwrOffset = ROUNDUP_CL(mainSize); + const size_t newSize = smwrOffset + smallWriteSize; + + auto rose2 = aligned_zmalloc_unique(newSize); + char *ptr = (char *)rose2.get(); + memcpy(ptr, rose.get(), mainSize); + memcpy(ptr + smwrOffset, smwr_engine.get(), smallWriteSize); + + rose2->smallWriteOffset = verify_u32(smwrOffset); + rose2->size = verify_u32(newSize); + + return rose2; +} + aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { DerivedBoundaryReports dboundary(boundary); @@ -3981,10 +5218,15 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { bc.resources.has_anchored = true; } bc.needs_mpv_catchup = needsMpvCatchup(*this); + bc.vertex_group_map = getVertexGroupMap(*this); + bc.squashable_groups = getSquashableGroups(*this); auto boundary_out = makeBoundaryPrograms(*this, bc, boundary, dboundary); - u32 reportProgramOffset = buildReportPrograms(*this, bc); + u32 reportProgramOffset; + u32 reportProgramCount; + tie(reportProgramOffset, reportProgramCount) = + buildReportPrograms(*this, bc); // Build NFAs set no_retrigger_queues; @@ -3997,7 +5239,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 outfixEndQueue = qif.allocated_count(); u32 leftfixBeginQueue = outfixEndQueue; - if (!buildNfas(*this, bc, qif, &no_retrigger_queues, + set eager_queues; + + /* Note: buildNfas may reduce the lag for vertices that have prefixes */ + if (!buildNfas(*this, bc, qif, &no_retrigger_queues, &eager_queues, &leftfixBeginQueue)) { return nullptr; } @@ 
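addSmallWriteEngine above appends the small-write engine to the Rose bytecode at a cacheline-aligned offset and patches the header's offset and size fields. A minimal layout sketch (EngineHeader and the 64-byte constant stand in for the real RoseEngine header and ROUNDUP_CL):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct EngineHeader {
    uint32_t size = 0;
    uint32_t smallWriteOffset = 0; // 0 means "no small-write engine"
};

static size_t roundUpCacheline(size_t n) {
    return (n + 63) & ~size_t{63}; // align up to a 64-byte cacheline
}

// Produce a new buffer containing the main engine followed by the small-write
// engine at a cacheline-aligned offset, with the header updated to match.
static std::vector<char> appendSmallWrite(const std::vector<char> &main_engine,
                                          const std::vector<char> &smwr) {
    assert(main_engine.size() >= sizeof(EngineHeader));
    size_t smwr_offset = roundUpCacheline(main_engine.size());
    std::vector<char> out(smwr_offset + smwr.size(), 0);

    std::memcpy(out.data(), main_engine.data(), main_engine.size());
    std::memcpy(out.data() + smwr_offset, smwr.data(), smwr.size());

    auto *hdr = reinterpret_cast<EngineHeader *>(out.data());
    hdr->smallWriteOffset = static_cast<uint32_t>(smwr_offset);
    hdr->size = static_cast<uint32_t>(out.size());
    return out;
}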
-4017,7 +5262,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 laggedRoseCount = 0; vector leftInfoTable; - buildLeftInfoTable(*this, bc, leftfixBeginQueue, + buildLeftInfoTable(*this, bc, eager_queues, leftfixBeginQueue, queue_count - leftfixBeginQueue, leftInfoTable, &laggedRoseCount, &historyRequired); @@ -4026,15 +5271,14 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { tie(litProgramOffset, litDelayRebuildProgramOffset) = buildLiteralPrograms(*this, bc); - u32 eodProgramOffset = writeEodProgram(*this, bc); - u32 eodIterProgramOffset; - u32 eodIterOffset; - tie(eodIterProgramOffset, eodIterOffset) = buildEodAnchorProgram(*this, bc); + u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); vector activeLeftIter; buildActiveLeftIter(leftInfoTable, activeLeftIter); u32 lastByteOffset = buildLastByteIter(g, bc); + u32 eagerIterOffset = buildEagerQueueIter(eager_queues, leftfixBeginQueue, + queue_count, bc); // Enforce role table resource limit. if (num_vertices(g) > cc.grey.limitRoseRoleCount) { @@ -4057,7 +5301,8 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // Build anchored matcher. size_t asize = 0; u32 amatcherOffset = 0; - auto atable = buildAnchoredMatcher(*this, anchored_dfas, &asize); + auto atable = buildAnchoredMatcher(*this, anchored_dfas, bc.litPrograms, + &asize); if (atable) { currOffset = ROUNDUP_CL(currOffset); amatcherOffset = currOffset; @@ -4065,9 +5310,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { } // Build floating HWLM matcher. + rose_group fgroups = 0; size_t fsize = 0; size_t floatingStreamStateRequired = 0; - auto ftable = buildFloatingMatcher(*this, &fsize, &historyRequired, + auto ftable = buildFloatingMatcher(*this, &fgroups, &fsize, &historyRequired, &floatingStreamStateRequired); u32 fmatcherOffset = 0; if (ftable) { @@ -4200,17 +5446,16 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->litProgramOffset = litProgramOffset; engine->litDelayRebuildProgramOffset = litDelayRebuildProgramOffset; engine->reportProgramOffset = reportProgramOffset; - engine->reportProgramCount = verify_u32(rm.reports().size()); + engine->reportProgramCount = reportProgramCount; engine->runtimeImpl = pickRuntimeImpl(*this, bc, outfixEndQueue); engine->mpvTriggeredByLeaf = anyEndfixMpvTriggers(*this); engine->activeArrayCount = activeArrayCount; engine->activeLeftCount = activeLeftCount; engine->queueCount = queue_count; + engine->eagerIterOffset = eagerIterOffset; engine->handledKeyCount = bc.handledKeys.size(); - engine->group_weak_end = group_weak_end; - engine->rolesWithStateCount = bc.numStates; engine->leftOffset = leftOffset; @@ -4226,9 +5471,6 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->nfaInfoOffset = nfaInfoOffset; engine->eodProgramOffset = eodProgramOffset; - engine->eodIterProgramOffset = eodIterProgramOffset; - engine->eodIterOffset = eodIterOffset; - engine->eodNfaIterOffset = eodNfaIterOffset; engine->lastByteHistoryIterOffset = lastByteOffset; @@ -4282,6 +5524,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { fillMatcherDistances(*this, engine.get()); engine->initialGroups = getInitialGroups(); + engine->floating_group_mask = fgroups; engine->totalNumLiterals = verify_u32(literal_info.size()); engine->asize = verify_u32(asize); engine->ematcherRegionSize = ematcher_region_size; @@ -4315,6 +5558,9 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // after 
we copied it into the engine bytecode. assert(byte_length(bc.engine_blob) == engineBlobSize); + // Add a small write engine if appropriate. + engine = addSmallWriteEngine(*this, move(engine)); + DEBUG_PRINTF("rose done %p\n", engine.get()); return engine; } diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 12500599..3f82a9cc 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -34,6 +34,8 @@ #include "rose_build_castle.h" #include "rose_build_convert.h" #include "rose_build_dump.h" +#include "rose_build_groups.h" +#include "rose_build_matchers.h" #include "rose_build_merge.h" #include "rose_build_role_aliasing.h" #include "rose_build_util.h" @@ -68,7 +70,6 @@ #include #include #include -#include #include #include #include @@ -77,65 +78,16 @@ #include using namespace std; -using boost::adaptors::map_keys; using boost::adaptors::map_values; namespace ue2 { -#define ROSE_LONG_LITERAL_LEN 8 - #define ANCHORED_REHOME_MIN_FLOATING 800 #define ANCHORED_REHOME_MIN_FLOATING_SHORT 50 #define ANCHORED_REHOME_ALLOW_SHORT 20 #define ANCHORED_REHOME_DEEP 25 #define ANCHORED_REHOME_SHORT_LEN 3 -static -bool superStrong(const rose_literal_id &lit) { - if (lit.s.length() < ROSE_LONG_LITERAL_LEN) { - return false; - } - - const u32 EXPECTED_FDR_BUCKET_LENGTH = 8; - - assert(lit.s.length() >= EXPECTED_FDR_BUCKET_LENGTH); - size_t len = lit.s.length(); - const string &s = lit.s.get_string(); - - for (size_t i = 1; i < EXPECTED_FDR_BUCKET_LENGTH; i++) { - if (s[len - 1 - i] != s[len - 1]) { - return true; /* we have at least some variation in the tail */ - } - } - DEBUG_PRINTF("lit '%s' is not superstrong due to tail\n", - escapeString(s).c_str()); - return false; -} - -rose_group RoseBuildImpl::getGroups(RoseVertex v) const { - rose_group groups = 0; - - for (u32 id : g[v].literals) { - u32 lit_id = literal_info.at(id).undelayed_id; - - rose_group mygroups = literal_info[lit_id].group_mask; - groups |= mygroups; - } - - return groups; -} - -/** \brief Get the groups of the successor literals of a given vertex. */ -rose_group RoseBuildImpl::getSuccGroups(RoseVertex start) const { - rose_group initialGroups = 0; - - for (auto v : adjacent_vertices_range(start, g)) { - initialGroups |= getGroups(v); - } - - return initialGroups; -} - #ifdef DEBUG static UNUSED void printLitInfo(const rose_literal_info &li, u32 id) { @@ -481,6 +433,9 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { // If the bounds are {0,0}, this role can only match precisely at EOD. if (minBound == 0 && maxBound == 0) { + /* last byte history will squash the state byte so cannot have other + * succ */ + assert(out_degree(u, g) == 1); return ROSE_ROLE_HISTORY_LAST_BYTE; } @@ -501,7 +456,8 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { return ROSE_ROLE_HISTORY_NONE; } - if (g[u].fixedOffset()) { + if (g[u].fixedOffset() && + (g[e].minBound || g[e].maxBound != ROSE_BOUND_INF)) { DEBUG_PRINTF("fixed offset -> anch\n"); return ROSE_ROLE_HISTORY_ANCH; } @@ -555,8 +511,8 @@ bool RoseBuildImpl::isDirectReport(u32 id) const { } // Use the program to handle cases that aren't external reports. 
- for (const ReportID &id : g[v].reports) { - if (!isExternalReport(rm.getReport(id))) { + for (const ReportID &rid : g[v].reports) { + if (!isExternalReport(rm.getReport(rid))) { return false; } } @@ -585,6 +541,45 @@ bool RoseBuildImpl::isDirectReport(u32 id) const { return true; } + +/* If we have prefixes that can squash all the floating roots, we can have a + * somewhat-conditional floating table. As we can't yet look at squash_masks, we + * have to make some guess as to if we are in this case but the win for not + * running a floating table over a large portion of the stream is significantly + * larger than avoiding running an eod table over the last N bytes. */ +static +bool checkFloatingKillableByPrefixes(const RoseBuildImpl &tbi) { + for (auto v : vertices_range(tbi.g)) { + if (!tbi.isRootSuccessor(v)) { + continue; + } + + if (!tbi.isFloating(v)) { + continue; + } + + if (!tbi.g[v].left) { + DEBUG_PRINTF("unguarded floating root\n"); + return false; + } + + if (tbi.g[v].left.graph) { + const NGHolder &h = *tbi.g[v].left.graph; + if (proper_out_degree(h.startDs, h)) { + DEBUG_PRINTF("floating nfa prefix, won't die\n"); + return false; + } + } else if (tbi.g[v].left.dfa) { + if (tbi.g[v].left.dfa->start_floating != DEAD_STATE) { + DEBUG_PRINTF("floating dfa prefix, won't die\n"); + return false; + } + } + } + + return true; +} + static bool checkEodStealFloating(const RoseBuildImpl &tbi, const vector &eodLiteralsForFloating, @@ -606,6 +601,11 @@ bool checkEodStealFloating(const RoseBuildImpl &tbi, return false; } + if (checkFloatingKillableByPrefixes(tbi)) { + DEBUG_PRINTF("skipping as prefixes may make ftable conditional\n"); + return false; + } + DEBUG_PRINTF("%zu are eod literals, %u floating; floating len=%zu\n", eodLiteralsForFloating.size(), numFloatingLiterals, shortestFloatingLen); @@ -862,274 +862,6 @@ bool RoseBuildImpl::hasFinalId(u32 id) const { return literal_info.at(id).final_id != MO_INVALID_IDX; } -static -bool eligibleForAlwaysOnGroup(const RoseBuildImpl &tbi, u32 id) { - /* returns true if it or any of its delay versions have root role */ - for (auto v : tbi.literal_info[id].vertices) { - if (tbi.isRootSuccessor(v)) { - NGHolder *h = tbi.g[v].left.graph.get(); - if (!h || proper_out_degree(h->startDs, *h)) { - return true; - } - } - } - - for (u32 delayed_id : tbi.literal_info[id].delayed_ids) { - for (auto v : tbi.literal_info[delayed_id].vertices) { - if (tbi.isRootSuccessor(v)) { - NGHolder *h = tbi.g[v].left.graph.get(); - if (!h || proper_out_degree(h->startDs, *h)) { - return true; - } - } - } - } - - return false; -} - -static -bool requires_group_assignment(const rose_literal_id &lit, - const rose_literal_info &info) { - if (lit.delay) { /* we will check the shadow's master */ - return false; - } - - if (lit.table == ROSE_ANCHORED || lit.table == ROSE_EVENT) { - return false; - } - - // If we already have a group applied, skip. - if (info.group_mask) { - return false; - } - - if (info.vertices.empty() && info.delayed_ids.empty()) { - DEBUG_PRINTF("literal is good for nothing\n"); - return false; - } - - return true; -} - -static -rose_group calcLocalGroup(const RoseVertex v, const RoseGraph &g, - const deque &literal_info, - const bool small_literal_count) { - rose_group local_group = 0; - - for (auto u : inv_adjacent_vertices_range(v, g)) { - /* In small cases, ensure that siblings have the same rose parentage to - * allow rose squashing. In larger cases, don't do this as groups are - * probably too scarce. 
*/ - for (auto w : adjacent_vertices_range(u, g)) { - if (!small_literal_count || g[v].left == g[w].left) { - for (u32 lit_id : g[w].literals) { - local_group |= literal_info[lit_id].group_mask; - } - } else { - DEBUG_PRINTF("not sibling different mother %zu %zu\n", - g[v].idx, g[w].idx); - } - } - } - - return local_group; -} - -/* group constants */ -#define MAX_LIGHT_LITERAL_CASE 200 /* allow rose to affect group decisions below - * this */ - -static -flat_set getAssociatedVertices(const RoseBuildImpl &build, u32 id) { - flat_set out; - const auto &info = build.literal_info[id]; - insert(&out, info.vertices); - for (const auto &delayed : info.delayed_ids) { - insert(&out, build.literal_info[delayed].vertices); - } - return out; -} - -static -u32 next_available_group(u32 counter, u32 min_start_group) { - counter++; - if (counter == ROSE_GROUPS_MAX) { - DEBUG_PRINTF("resetting groups\n"); - counter = min_start_group; - } - - return counter; -} - -// Assigns groups to literals in the general case, when we have more literals -// than available groups. -void RoseBuildImpl::assignGroupsToLiterals() { - bool small_literal_count = literal_info.size() <= MAX_LIGHT_LITERAL_CASE; - - map groupCount; /* group index to number of members */ - - u32 counter = 0; - u32 group_always_on = 0; - - // First pass: handle always on literals. - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - rose_literal_info &info = literal_info[id]; - - if (!requires_group_assignment(lit, info)) { - continue; - } - - // If this literal has a root role, we always have to search for it - // anyway, so it goes in the always-on group. - /* We could end up squashing it if it is followed by a .* */ - if (eligibleForAlwaysOnGroup(*this, id)) { - info.group_mask = 1ULL << group_always_on; - groupCount[group_always_on]++; - continue; - } - } - - u32 group_long_lit; - if (groupCount[group_always_on]) { - DEBUG_PRINTF("%u always on literals\n", groupCount[group_always_on]); - group_long_lit = group_always_on; - counter++; - } else { - group_long_lit = counter; - counter++; - } - - u32 min_start_group = counter; - priority_queue, u32> > pq; - - // Second pass: the other literals. 
- for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - rose_literal_info &info = literal_info[id]; - - if (!requires_group_assignment(lit, info)) { - continue; - } - - assert(!eligibleForAlwaysOnGroup(*this, id)); - pq.push(make_pair(make_pair(-(s32)literal_info[id].vertices.size(), - -(s32)lit.s.length()), id)); - } - - vector long_lits; - while (!pq.empty()) { - u32 id = pq.top().second; - pq.pop(); - UNUSED const rose_literal_id &lit = literals.right.at(id); - DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id, - literal_info[id].vertices.size(), lit.s.length()); - - u8 group_id = 0; - rose_group group = ~0ULL; - for (auto v : getAssociatedVertices(*this, id)) { - rose_group local_group = calcLocalGroup(v, g, literal_info, - small_literal_count); - group &= local_group; - if (!group) { - break; - } - } - - if (group == ~0ULL) { - goto boring; - } - - group &= ~((1ULL << min_start_group) - 1); /* ensure the purity of the - * always_on groups */ - if (!group) { - goto boring; - } - - group_id = ctz64(group); - - /* TODO: fairness */ - DEBUG_PRINTF("picking sibling group %hhd\n", group_id); - literal_info[id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - - continue; - - boring: - /* long literals will either be stuck in a mega group or spread around - * depending on availability */ - if (superStrong(lit)) { - long_lits.push_back(id); - continue; - } - - // Other literals are assigned to our remaining groups round-robin. - group_id = counter; - - DEBUG_PRINTF("picking boring group %hhd\n", group_id); - literal_info[id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - counter = next_available_group(counter, min_start_group); - } - - /* spread long literals out amongst unused groups if any, otherwise stick - * them in the always on the group */ - - if (groupCount[counter]) { - DEBUG_PRINTF("sticking long literals in the image of the always on\n"); - for (u32 lit_id : long_lits) { - literal_info[lit_id].group_mask = 1ULL << group_long_lit; - groupCount[group_long_lit]++; - } - } else { - u32 min_long_counter = counter; - DEBUG_PRINTF("base long lit group = %u\n", min_long_counter); - for (u32 lit_id : long_lits) { - u8 group_id = counter; - literal_info[lit_id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - counter = next_available_group(counter, min_long_counter); - } - } - - /* assign delayed literals to the same group as their parent */ - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - - if (!lit.delay) { - continue; - } - - u32 parent = literal_info[id].undelayed_id; - DEBUG_PRINTF("%u is shadow picking up groups from %u\n", id, parent); - assert(literal_info[parent].undelayed_id == parent); - assert(literal_info[parent].group_mask); - literal_info[id].group_mask = literal_info[parent].group_mask; - /* don't increment the group count - these don't really exist */ - } - - DEBUG_PRINTF("populate group to literal mapping\n"); - for (const u32 id : literals.right | map_keys) { - rose_group groups = literal_info[id].group_mask; - while (groups) { - u32 group_id = findAndClearLSB_64(&groups); - group_to_literal[group_id].insert(id); - } - } - - /* find how many groups we allocated */ - for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) { - if (groupCount[i]) { - group_end = MAX(group_end, i + 1); - } - } -} - bool RoseBuildImpl::hasDelayedLiteral(RoseVertex v) const { for (u32 lit_id : g[v].literals) { if (literals.right.at(lit_id).delay) { @@ -1160,213 
+892,6 @@ bool RoseBuildImpl::hasAnchoredTablePred(RoseVertex v) const { return false; } -/* returns true if every vertex associated with a groups also belongs to - lit_info */ -static -bool coversGroup(const RoseBuildImpl &tbi, const rose_literal_info &lit_info) { - if (lit_info.vertices.empty()) { - DEBUG_PRINTF("no vertices - does not cover\n"); - return false; - } - - if (!lit_info.group_mask) { - DEBUG_PRINTF("no group - does not cover\n"); - return false; /* no group (not a floating lit?) */ - } - - assert(popcount64(lit_info.group_mask) == 1); - - /* for each lit in group, ensure that vertices are a subset of lit_info's */ - rose_group groups = lit_info.group_mask; - while (groups) { - u32 group_id = findAndClearLSB_64(&groups); - for (u32 id : tbi.group_to_literal.at(group_id)) { - DEBUG_PRINTF(" checking against friend %u\n", id); - if (!is_subset_of(tbi.literal_info[id].vertices, - lit_info.vertices)) { - DEBUG_PRINTF("fail\n"); - return false; - } - } - } - - DEBUG_PRINTF("ok\n"); - return true; -} - -static -bool isGroupSquasher(const RoseBuildImpl &tbi, const u32 id /* literal id */, - rose_group forbidden_squash_group) { - const RoseGraph &g = tbi.g; - - const rose_literal_info &lit_info = tbi.literal_info.at(id); - - DEBUG_PRINTF("checking if %u '%s' is a group squasher %016llx\n", id, - dumpString(tbi.literals.right.at(id).s).c_str(), - lit_info.group_mask); - - if (tbi.literals.right.at(id).table == ROSE_EVENT) { - DEBUG_PRINTF("event literal, has no groups to squash\n"); - return false; - } - - if (!coversGroup(tbi, lit_info)) { - DEBUG_PRINTF("does not cover group\n"); - return false; - } - - if (lit_info.group_mask & forbidden_squash_group) { - /* probably a delayed lit */ - DEBUG_PRINTF("skipping as involves a forbidden group\n"); - return false; - } - - // Single-vertex, less constrained case than the multiple-vertex one below. - if (lit_info.vertices.size() == 1) { - const RoseVertex &v = *lit_info.vertices.begin(); - - if (tbi.hasDelayPred(v)) { /* due to rebuild issues */ - return false; - } - - /* there are two ways to be a group squasher: - * 1) only care about the first accepted match - * 2) can only match once after a pred match - * - * (2) requires analysis of the infix before v and is not implemented, - * TODO - */ - - /* Case 1 */ - - // Can't squash cases with accepts - if (!g[v].reports.empty()) { - return false; - } - - /* Can't squash cases with a suffix without analysis of the suffix. - * TODO: look at suffixes */ - if (g[v].suffix) { - return false; - } - - // Out-edges must have inf max bound, + no other shenanigans */ - for (const auto &e : out_edges_range(v, g)) { - if (g[e].maxBound != ROSE_BOUND_INF) { - return false; - } - - if (g[target(e, g)].left) { - return false; /* is an infix rose trigger, TODO: analysis */ - } - } - - DEBUG_PRINTF("%u is a path 1 group squasher\n", id); - return true; - - /* note: we could also squash the groups of its preds (if nobody else is - * using them. TODO. */ - } - - // Multiple-vertex case - for (auto v : lit_info.vertices) { - assert(!tbi.isAnyStart(v)); - - // Can't squash cases with accepts - if (!g[v].reports.empty()) { - return false; - } - - // Suffixes and leftfixes are out too as first literal may not match - // for everyone. 
- if (!g[v].isBoring()) { - return false; - } - - /* TODO: checks are solid but we should explain */ - if (tbi.hasDelayPred(v) || tbi.hasAnchoredTablePred(v)) { - return false; - } - - // Out-edges must have inf max bound and not directly lead to another - // vertex with this group, e.g. 'foobar.*foobar'. - for (const auto &e : out_edges_range(v, g)) { - if (g[e].maxBound != ROSE_BOUND_INF) { - return false; - } - RoseVertex t = target(e, g); - - if (g[t].left) { - return false; /* is an infix rose trigger */ - } - - for (u32 lit_id : g[t].literals) { - if (tbi.literal_info[lit_id].group_mask & lit_info.group_mask) { - return false; - } - } - } - - // In-edges must all be dot-stars with no overlap at all, as overlap - // also causes history to be used. - /* Different tables are already forbidden by previous checks */ - for (const auto &e : in_edges_range(v, g)) { - if (!(g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF)) { - return false; - } - - // Check overlap, if source was a literal. - RoseVertex u = source(e, g); - if (tbi.maxLiteralOverlap(u, v)) { - return false; - } - } - } - - DEBUG_PRINTF("literal %u is a multi-vertex group squasher\n", id); - return true; -} - -static -void findGroupSquashers(RoseBuildImpl &tbi) { - rose_group forbidden_squash_group = 0; - for (const auto &e : tbi.literals.right) { - if (e.second.delay) { - forbidden_squash_group |= tbi.literal_info[e.first].group_mask; - } - } - - for (u32 id = 0; id < tbi.literal_info.size(); id++) { - if (isGroupSquasher(tbi, id, forbidden_squash_group)) { - tbi.literal_info[id].squash_group = true; - } - } -} - -/** - * The groups that a role sets are determined by the union of its successor - * literals. Requires the literals already have had groups assigned. - */ -void RoseBuildImpl::assignGroupsToRoles() { - /* Note: if there is a succ literal in the sidematcher, its successors - * literals must be added instead */ - for (auto v : vertices_range(g)) { - if (isAnyStart(v)) { - continue; - } - - const rose_group succ_groups = getSuccGroups(v); - g[v].groups |= succ_groups; - - if (ghost.find(v) != ghost.end()) { - /* delayed roles need to supply their groups to the ghost role */ - g[ghost[v]].groups |= succ_groups; - } - - DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups); - } -} - void RoseBuildImpl::findTransientLeftfixes(void) { for (auto v : vertices_range(g)) { if (!g[v].left) { @@ -1393,19 +918,32 @@ void RoseBuildImpl::findTransientLeftfixes(void) { continue; } - u32 his = g[v].left.lag + max_width; + if (cc.streaming) { + /* STREAMING: transient prefixes must be able to run using history + * rather than storing state. */ + u32 his = g[v].left.lag + max_width; - // If this vertex has an event literal, we need to add one to cope - // with it. - if (hasLiteralInTable(v, ROSE_EVENT)) { - his++; - } + // If this vertex has an event literal, we need to add one to cope + // with it. 
+ if (hasLiteralInTable(v, ROSE_EVENT)) { + his++; + } - /* +1 as trigger must appear in main buffer and no byte is needed to - * decompress the state */ - if (his <= cc.grey.maxHistoryAvailable + 1) { - transient.insert(left); - DEBUG_PRINTF("a transient leftfix has been spotted his=%u\n", his); + /* +1 as trigger must appear in main buffer and no byte is needed to + * decompress the state */ + if (his <= cc.grey.maxHistoryAvailable + 1) { + transient.insert(left); + DEBUG_PRINTF("a transient leftfix spotted his=%u\n", his); + } + } else { + /* BLOCK: transientness is less important and more fuzzy, ideally + * it should be quick to calculate the state. No need to worry about + * history (and hence lag). */ + if (max_width < depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH)) { + transient.insert(left); + DEBUG_PRINTF("a transient block leftfix spotted [%u]\n", + (u32)max_width); + } } } } @@ -1718,7 +1256,8 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai, assert(old_id < tbi.literal_info.size()); const rose_literal_info &li = tbi.literal_info[old_id]; - // For compile determinism, operate over literal vertices in index order. + // For compile determinism, operate over literal vertices in index + // order. vector lit_verts(begin(li.vertices), end(li.vertices)); sort(begin(lit_verts), end(lit_verts), VertexIndexComp(g)); @@ -1732,40 +1271,9 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai, g[v].max_offset = sai.max_bound + sai.literal.length(); lit_info.vertices.insert(v); - assert(!g[v].reports.empty()); - - bool doDirectReports = true; - for (ReportID report_id : g[v].reports) { - const Report &old_rep = tbi.rm.getReport(report_id); - if (!isExternalReport(old_rep) || old_rep.hasBounds()) { - doDirectReports = false; - break; - } - } - - if (doDirectReports) { - flat_set dr_reports; - for (ReportID report_id : g[v].reports) { - // These new literal roles can be made direct reports, with - // their bounds handled by the bounds on their Report - // structures. - Report rep(tbi.rm.getReport(report_id)); // copy - assert(!rep.hasBounds()); - rep.minOffset = sai.literal.length() + sai.min_bound; - rep.maxOffset = sai.literal.length() + sai.max_bound; - dr_reports.insert(tbi.rm.getInternalId(rep)); - } - g[v].reports = dr_reports; - RoseEdge e = add_edge(tbi.root, v, g).first; - g[e].minBound = 0; // handled by internal_report - g[e].maxBound = ROSE_BOUND_INF; // handled by internal_report - } else { - // If we have a complex internal report, these must become - // anchored literals with their own roles. 
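The streaming/block split above decides transientness differently: in streaming mode the prefix must be re-runnable from history, so lag plus max width (plus one for an event literal) has to fit within the available history plus one, since the trigger itself sits in the main buffer; block mode just applies a width threshold. A small sketch of that arithmetic (the helpers are illustrative, the parameter names follow the diff):

#include <cstdint>

// Streaming: the leftfix can be rerun from history instead of storing stream
// state if its whole window fits in the bytes we keep.
static bool transientStreaming(uint32_t lag, uint32_t max_width,
                               bool has_event_literal,
                               uint32_t max_history_available) {
    uint32_t history_needed = lag + max_width;
    if (has_event_literal) {
        history_needed++; // event literal needs one extra byte of history
    }
    // +1: the trigger must appear in the main buffer, and no byte is needed
    // to decompress the state.
    return history_needed <= max_history_available + 1;
}

// Block mode: no history concerns; just keep quick-to-recompute (narrow)
// leftfixes transient.
static bool transientBlock(uint32_t max_width,
                           uint32_t block_transient_max_width) {
    return max_width < block_transient_max_width;
}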
- RoseEdge e = add_edge(anchored_root, v, g).first; - g[e].minBound = sai.min_bound; - g[e].maxBound = sai.max_bound; - } + RoseEdge e = add_edge(anchored_root, v, g).first; + g[e].minBound = sai.min_bound; + g[e].maxBound = sai.max_bound; } } } @@ -2181,8 +1689,10 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { assert(!danglingVertexRef(*this)); - assignGroupsToLiterals(); - assignGroupsToRoles(); + findMoreLiteralMasks(*this); + + assignGroupsToLiterals(*this); + assignGroupsToRoles(*this); findGroupSquashers(*this); /* final prep work */ diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index f5e99c23..1578dda1 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -651,6 +651,26 @@ CharReach getReachOfNormalVertex(const NGHolder &g) { return CharReach(); } +/** + * \brief Set the edge bounds and appropriate history on the given edge in the + * Rose graph. + */ +static +void setEdgeBounds(RoseGraph &g, const RoseEdge &e, u32 min_bound, + u32 max_bound) { + assert(min_bound <= max_bound); + assert(max_bound <= ROSE_BOUND_INF); + + g[e].minBound = min_bound; + g[e].maxBound = max_bound; + + if (min_bound || max_bound < ROSE_BOUND_INF) { + g[e].history = ROSE_ROLE_HISTORY_ANCH; + } else { + g[e].history = ROSE_ROLE_HISTORY_NONE; + } +} + static bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, const RoseEdge &e_old, RoseVertex ar, @@ -686,18 +706,13 @@ bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, if (source(e_old, g) == ar) { assert(g[e_old].minBound <= bound_min); assert(g[e_old].maxBound >= bound_max); - g[e_old].minBound = bound_min; - g[e_old].maxBound = bound_max; - g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, bound_min, bound_max); } else { RoseEdge e_new; UNUSED bool added; tie(e_new, added) = add_edge(ar, v, g); assert(added); - g[e_new].minBound = bound_min; - g[e_new].maxBound = bound_max; - g[e_new].history = ROSE_ROLE_HISTORY_ANCH; - + setEdgeBounds(g, e_new, bound_min, bound_max); to_delete->push_back(e_old); } @@ -751,9 +766,7 @@ bool handleStartDsPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, /* update bounds on edge */ assert(g[e].minBound <= repeatCount); - g[e].minBound = repeatCount; - g[e].maxBound = ROSE_BOUND_INF; - g[e].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e, repeatCount, ROSE_BOUND_INF); g[v].left.reset(); /* clear the prefix info */ @@ -893,26 +906,19 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, } if (source(e_old, g) == ar) { - g[e_old].minBound = ri.repeatMin + width; - g[e_old].maxBound = ri.repeatMax + width; - g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, ri.repeatMin + width, ri.repeatMax + width); } else { RoseEdge e_new; UNUSED bool added; tie(e_new, added) = add_edge(ar, v, g); assert(added); - g[e_new].minBound = ri.repeatMin + width; - g[e_new].maxBound = ri.repeatMax + width; - g[e_new].history = ROSE_ROLE_HISTORY_ANCH; - + setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); to_delete->push_back(e_old); } } else { assert(g[e_old].minBound <= ri.repeatMin + width); - g[e_old].minBound = ri.repeatMin + width; - g[e_old].maxBound = ROSE_BOUND_INF; - 
g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, ri.repeatMin + width, ROSE_BOUND_INF); } g[v].left.dfa.reset(); @@ -1110,19 +1116,9 @@ void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { bounds.min -= delay_adj; } bounds.max -= delay_adj; - - g[e].minBound = bounds.min; - g[e].maxBound = - bounds.max.is_finite() ? (u32)bounds.max : ROSE_BOUND_INF; - - // It's possible that a (0,inf) case might sneak through here, in which - // case we don't need ANCH history at all. - if (g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF) { - g[e].history = ROSE_ROLE_HISTORY_NONE; - } else { - g[e].history = ROSE_ROLE_HISTORY_ANCH; - } - + setEdgeBounds(g, e, bounds.min, bounds.max.is_finite() + ? (u32)bounds.max + : ROSE_BOUND_INF); g[v].left.reset(); } } diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 079dd556..5fb27c55 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -30,12 +30,13 @@ #include "rose_build_dump.h" -#include "hwlm/hwlm_build.h" #include "rose_build_impl.h" #include "rose_build_matchers.h" #include "rose/rose_dump.h" #include "rose_internal.h" #include "ue2common.h" +#include "hwlm/hwlm_build.h" +#include "nfa/castlecompile.h" #include "nfa/nfa_internal.h" #include "nfagraph/ng_dump.h" #include "som/slot_manager_dump.h" @@ -60,24 +61,22 @@ using namespace std; namespace ue2 { -static -string to_string(nfa_kind k) { - switch (k) { - case NFA_PREFIX: - return "p"; - case NFA_INFIX: - return "i"; - case NFA_SUFFIX: - return "s"; - case NFA_OUTFIX: - return "o"; - case NFA_REV_PREFIX: - return "r"; - case NFA_OUTFIX_RAW: - return "O"; +/** \brief Return the kind of a left_id or a suffix_id. */ +template +string render_kind(const Graph &g) { + if (g.graph()) { + return to_string(g.graph()->kind); } - assert(0); - return "?"; + if (g.dfa()) { + return to_string(g.dfa()->kind); + } + if (g.haig()) { + return to_string(g.haig()->kind); + } + if (g.castle()) { + return to_string(g.castle()->kind); + } + return "UNKNOWN"; } namespace { @@ -130,22 +129,12 @@ public: } if (g[v].suffix) { - os << "\\nSUFFIX (TOP " << g[v].suffix.top; - // Can't dump the queue number, but we can identify the suffix. - if (g[v].suffix.graph) { - os << ", graph=" << g[v].suffix.graph.get() << " " - << to_string(g[v].suffix.graph->kind); + suffix_id suff(g[v].suffix); + os << "\\n" << render_kind(suff) << " (top " << g[v].suffix.top; + auto it = build.suffix_queue_map.find(suff); + if (it != end(build.suffix_queue_map)) { + os << ", queue " << it->second; } - if (g[v].suffix.castle) { - os << ", castle=" << g[v].suffix.castle.get(); - } - if (g[v].suffix.rdfa) { - os << ", dfa=" << g[v].suffix.rdfa.get(); - } - if (g[v].suffix.haig) { - os << ", haig=" << g[v].suffix.haig.get(); - } - os << ")"; } @@ -154,15 +143,15 @@ public: } if (g[v].left) { - const char *roseKind = - build.isRootSuccessor(v) ? 
"PREFIX" : "INFIX"; - os << "\\nROSE " << roseKind; - os << " ("; - os << "report " << g[v].left.leftfix_report << ")"; - - if (g[v].left.graph) { - os << " " << to_string(g[v].left.graph->kind); + left_id left(g[v].left); + os << "\\n" << render_kind(left) << " (queue "; + auto it = build.leftfix_queue_map.find(left); + if (it != end(build.leftfix_queue_map)) { + os << it->second; + } else { + os << "??"; } + os << ", report " << g[v].left.leftfix_report << ")"; } os << "\""; @@ -262,14 +251,18 @@ void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t, const RoseBuildImpl &build = dynamic_cast(build_base); const Grey &grey = build.cc.grey; - if (!grey.dumpFlags) { + + /* "early" rose graphs should only be dumped if we are dumping intermediate + * graphs. Early graphs can be identified by the lack of a RoseEngine. */ + u32 flag_test = t ? Grey::DUMP_IMPL : Grey::DUMP_INT_GRAPH; + + if (!(grey.dumpFlags & flag_test)) { return; } stringstream ss; ss << grey.dumpPath << filename; - DEBUG_PRINTF("dumping graph to %s\n", ss.str().c_str()); ofstream os(ss.str()); @@ -447,18 +440,6 @@ void dumpTestLiterals(const string &filename, const vector &lits) { of.close(); } -namespace { -struct LongerThanLimit { - explicit LongerThanLimit(size_t len) : max_len(len) {} - bool operator()(const hwlmLiteral &lit) const { - return lit.s.length() > max_len; - } - - private: - size_t max_len; -}; -} - static void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED); @@ -470,13 +451,14 @@ void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_FLOATING); - auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); - lits.insert(end(lits), begin(lits2), end(lits2)); - lits.erase(remove_if(lits.begin(), lits.end(), - LongerThanLimit(ROSE_SMALL_BLOCK_LEN)), - lits.end()); - dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); + if (!build.cc.streaming) { + lits = fillHamsterLiteralList(build, ROSE_FLOATING, + ROSE_SMALL_BLOCK_LEN); + auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, + ROSE_SMALL_BLOCK_LEN); + lits.insert(end(lits), begin(lits2), end(lits2)); + dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); + } } void dumpRose(const RoseBuild &build_base, const RoseEngine *t, diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp new file mode 100644 index 00000000..c9e8d215 --- /dev/null +++ b/src/rose/rose_build_exclusive.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ue2common.h" + +#include "rose_build_exclusive.h" +#include "rose_build_merge.h" +#include "nfa/castlecompile.h" +#include "nfagraph/ng_execute.h" +#include "nfagraph/ng_holder.h" +#include "nfagraph/ng_util.h" +#include "util/clique.h" +#include "util/compile_context.h" +#include "util/container.h" +#include "util/graph.h" +#include "util/make_unique.h" + +using namespace std; + +namespace ue2 { + +template +struct RoleChunk { + vector> roles; +}; + +static +CharReach getReachability(const NGHolder &h) { + CharReach cr; + for (const auto &v : vertices_range(h)) { + if (!is_special(v, h)) { + cr |= h[v].char_reach; + } + } + return cr; +} + +template +static +vector> divideIntoChunks(const RoseBuildImpl &build, + set> &roleInfoSet) { + u32 chunkSize = build.cc.grey.tamaChunkSize; + u32 cnt = 1; + vector> chunks; + RoleChunk roleChunk; + for (const auto &roleInfo : roleInfoSet) { + if (cnt == chunkSize) { + cnt -= chunkSize; + chunks.push_back(roleChunk); + roleChunk.roles.clear(); + } + roleChunk.roles.push_back(roleInfo); + cnt++; + } + + if (cnt > 1) { + chunks.push_back(roleChunk); + } + + return chunks; +} + +/* add prefix literals to engine graph */ +static +bool addPrefixLiterals(NGHolder &h, ue2::unordered_set &tailId, + const vector> &triggers) { + DEBUG_PRINTF("add literals to graph\n"); + + NFAVertex start = h.start; + vector heads; + vector tails; + for (const auto &lit : triggers) { + NFAVertex last = start; + if (lit.empty()) { + return false; + } + u32 i = 0; + for (const auto &c : lit) { + DEBUG_PRINTF("lit:%s \n", c.to_string().c_str()); + NFAVertex u = add_vertex(h); + h[u].char_reach = c; + if (!i++) { + heads.push_back(u); + last = u; + continue; + } + add_edge(last, u, h); + last = u; + } + tails.push_back(last); + tailId.insert(h[last].index); + } + + for (auto v : adjacent_vertices_range(start, h)) { + if (v != h.startDs) { + for (auto &t : tails) { + add_edge(t, v, h); + } + } + } + + clear_out_edges(start, h); + add_edge(h.start, h.start, h); + for (auto &t : heads) { + add_edge(start, t, h); + } + + DEBUG_PRINTF("literals addition done\n"); + return true; +} + +/* check if one literal is suffix of another */ +static +bool isSuffix(const vector> &triggers1, + const vector> &triggers2) { + // literal suffix test + for (const auto &lit1 : triggers1) { + for (const auto &lit2 : triggers2) { + const size_t len = min(lit1.size(), lit2.size()); + if (equal(lit1.rbegin(), lit1.rbegin() + len, + lit2.rbegin(), overlaps)) { + return true; + } + } + } + return false; +} + +/* prepare initial infix or suffix graph used for exclusive analysis */ +template +static +u32 prepareRoleGraph(NGHolder &h, const role_id &s1) { + u32 num = 0; + if (s1.castle()) { + num = 
num_vertices(h); + NFAVertex u = add_vertex(h); + h[u].char_reach = s1.castle()->reach(); + add_edge(h.startDs, u, h); + // add self loop to repeat characters + add_edge(u, u, h); + } else if (s1.graph()) { + const NGHolder &g = *s1.graph(); + cloneHolder(h, g); + num = num_vertices(h); + } else { + // only infixes and suffixes with graph properties are possible + // candidates, already filtered out other cases before + // exclusive analysis + assert(0); + } + + return num; +} + +/* get a subset of literal if reset character is found */ +static +vector findStartPos(const CharReach &cr1, + const vector &lit) { + auto it = lit.rbegin(), ite = lit.rend(); + u32 pos = lit.size(); + for (; it != ite; it++) { + if (!overlaps(cr1, *it)) { + break; + } + pos--; + } + + return vector (lit.begin() + pos, lit.end()); +} + +template +static +bool isExclusive(const NGHolder &h, + const u32 num, ue2::unordered_set &tailId, + map> &skipList, + const RoleInfo &role1, + const RoleInfo &role2) { + const u32 id1 = role1.id; + const u32 id2 = role2.id; + + if (contains(skipList, id1) && contains(skipList[id1], id2)) { + return false; + } + + const auto &triggers1 = role1.literals; + const auto &triggers2 = role2.literals; + if (isSuffix(triggers1, triggers2)) { + skipList[id2].insert(id1); + return false; + } + + DEBUG_PRINTF("role id2:%u\n", id2); + const auto &cr1 = role1.cr; + if (overlaps(cr1, role2.last_cr)) { + CharReach cr = cr1 | role1.prefix_cr; + for (const auto &lit : triggers2) { + auto lit1 = findStartPos(cr, lit); + if (lit1.empty()) { + continue; + } + u32 lower_bound = 0; + if (lit1.size() < lit.size()) { + lower_bound = ~0U; + } + + ue2::flat_set states; + for (const auto &v : vertices_range(h)) { + if (h[v].index >= lower_bound || h[v].index < 2) { + states.insert(v); + } + } + + auto activeStates = execute_graph(h, lit1, states); + // Check if has only literal states are on + for (const auto &s : activeStates) { + u32 stateId = h[s].index; + if ((stateId > 1 && stateId <= num) || + contains(tailId, stateId)) { + skipList[id2].insert(id1); + return false; + } + } + } + } + + return true; +} + +template +static +ue2::unordered_set checkExclusivity(const NGHolder &h, + const u32 num, ue2::unordered_set &tailId, + map> &skipList, + const RoleInfo &role1, + const RoleChunk &roleChunk) { + ue2::unordered_set info; + const u32 id1 = role1.id; + for (const auto &role2 : roleChunk.roles) { + const u32 id2 = role2.id; + if (id1 != id2 && isExclusive(h, num, tailId, skipList, + role1, role2)) { + info.insert(id2); + } + } + + return info; +} + +static +void findCliques(const map> &exclusiveGroups, + vector> &exclusive_roles) { + if (exclusiveGroups.empty()) { + return; + } + // Construct the exclusivity graph + map vertex_map; + unique_ptr cg = make_unique(); + + // Add vertices representing infixes/suffixes + for (const auto &e : exclusiveGroups) { + const u32 id = e.first; + CliqueVertex v1 = add_vertex(CliqueVertexProps(id), *cg); + vertex_map[id] = v1; + } + + // Wire exclusive pairs + for (const auto &e1 : exclusiveGroups) { + const u32 literalId1 = e1.first; + CliqueVertex lv = vertex_map[literalId1]; + const set &exclusiveSet = e1.second; + for (const auto &e2 : exclusiveGroups) { + const u32 literalId2 = e2.first; + if (literalId1 < literalId2 && + contains(exclusiveSet, literalId2)) { + add_edge(lv, vertex_map[literalId2], *cg); + DEBUG_PRINTF("Wire %u:%u\n", literalId1, literalId2); + } + } + } + + // Find clique groups + const auto &clique = removeClique(*cg); + for (const auto &i : clique) 
{ + DEBUG_PRINTF("cliq:%lu\n", i.size()); + if (i.size() > 1) { + exclusive_roles.push_back(i); + } + } + DEBUG_PRINTF("Clique graph size:%lu\n", exclusive_roles.size()); +} + +static +map> findExclusiveGroups(const RoseBuildImpl &build, + const map> &exclusiveInfo, + const map> &vertex_map, + const bool is_infix) { + map> exclusiveGroups; + for (const auto &e : exclusiveInfo) { + u32 i = e.first; + const auto &s = e.second; + set group; + set q1(vertex_map.at(i).begin(), + vertex_map.at(i).end()); + DEBUG_PRINTF("vertex set:%lu\n", q1.size()); + for (const auto &val : s) { + set q2(vertex_map.at(val).begin(), + vertex_map.at(val).end()); + if (contains(exclusiveInfo.at(val), i) && + (!is_infix || mergeableRoseVertices(build, q1, q2))) { + group.insert(val); + } + } + if (!group.empty()) { + exclusiveGroups[i] = group; + } + } + + return exclusiveGroups; +} + +template +static +bool setTriggerLiterals(RoleInfo &roleInfo, + const map>> &triggers) { + u32 minLiteralLen = ~0U; + for (const auto &tr : triggers) { + for (const auto &lit : tr.second) { + if (lit.empty()) { + return false; + } + minLiteralLen = min(minLiteralLen, (u32)lit.size()); + roleInfo.last_cr |= lit.back(); + for (const auto &c : lit) { + roleInfo.prefix_cr |= c; + } + roleInfo.literals.push_back(lit); + } + } + + if (roleInfo.role.graph()) { + const NGHolder &g = *roleInfo.role.graph(); + roleInfo.cr = getReachability(g); + } else if (roleInfo.role.castle()) { + roleInfo.cr = roleInfo.role.castle()->reach(); + } + + // test the score of this engine + roleInfo.score = 256 - roleInfo.cr.count() + minLiteralLen; + if (roleInfo.score < 20) { + return false; + } + + return true; +} + +bool setTriggerLiteralsInfix(RoleInfo &roleInfo, + const map>> &triggers) { + return setTriggerLiterals(roleInfo, triggers); +} + +bool setTriggerLiteralsSuffix(RoleInfo &roleInfo, + const map>> &triggers) { + return setTriggerLiterals(roleInfo, triggers); +} + +template +static +void exclusiveAnalysis(const RoseBuildImpl &build, + const map> &vertex_map, + set> &roleInfoSet, + vector> &exclusive_roles, const bool is_infix) { + const auto &chunks = divideIntoChunks(build, roleInfoSet); + DEBUG_PRINTF("Exclusivity analysis entry\n"); + map> exclusiveInfo; + + for (const auto &roleChunk : chunks) { + map> skipList; + for (const auto &role1 : roleChunk.roles) { + const u32 id1 = role1.id; + const role_id &s1 = role1.role; + const auto &triggers1 = role1.literals; + + NGHolder h; + u32 num = prepareRoleGraph(h, s1); + DEBUG_PRINTF("role id1:%u\n", id1); + unordered_set tailId; + if (!addPrefixLiterals(h, tailId, triggers1)) { + continue; + } + + exclusiveInfo[id1] = checkExclusivity(h, num, tailId, + skipList, role1, roleChunk); + } + } + + // Create final candidate exclusive groups + const auto exclusiveGroups = + findExclusiveGroups(build, exclusiveInfo, vertex_map, is_infix); + exclusiveInfo.clear(); + + // Find cliques for each exclusive groups + findCliques(exclusiveGroups, exclusive_roles); +} + +void exclusiveAnalysisInfix(const RoseBuildImpl &build, + const map> &vertex_map, + set> &roleInfoSet, + vector> &exclusive_roles) { + exclusiveAnalysis(build, vertex_map, roleInfoSet, exclusive_roles, + true); +} + +void exclusiveAnalysisSuffix(const RoseBuildImpl &build, + const map> &vertex_map, + set> &roleInfoSet, + vector> &exclusive_roles) { + exclusiveAnalysis(build, vertex_map, roleInfoSet, exclusive_roles, + false); +} + +} // namespace ue2 diff --git a/src/rose/rose_build_exclusive.h b/src/rose/rose_build_exclusive.h new file mode 100644 
index 00000000..9cabb1d2 --- /dev/null +++ b/src/rose/rose_build_exclusive.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief exclusive analysis for infix and suffix engines. + * Two engines are considered as exclusive if they can never be alive + * at the same time. This analysis takes advantage of the property of + * triggering literal + engine graph. If the triggering literals of + * two engines can make all the states dead in each other's graph, + * then they are exclusive. + */ +#ifndef ROSE_BUILD_EXCLUSIVE_H +#define ROSE_BUILD_EXCLUSIVE_H + +#include "ue2common.h" + +#include "rose_build_impl.h" +#include "util/alloc.h" +#include "util/charreach.h" + +#include +#include +#include + +namespace ue2 { + +/** brief subengine info including built engine and + * corresponding triggering rose vertices */ +struct ExclusiveSubengine { + aligned_unique_ptr nfa; + std::vector vertices; +}; + +/** \brief exclusive info to build tamarama */ +struct ExclusiveInfo { + // subengine info + std::vector subengines; + // all the report in tamarama + std::set reports; + // assigned queue id + u32 queue; +}; + +/** \brief role info structure for exclusive analysis */ +template +struct RoleInfo { + RoleInfo(role_id role_in, u32 id_in) : role(role_in), id(id_in) {} + bool operator==(const RoleInfo &b) const { + return id == b.id; + } + bool operator!=(const RoleInfo &b) const { return !(*this == b); } + bool operator<(const RoleInfo &b) const { + const RoleInfo &a = *this; + if (a.score != b.score) { + return a.score > b.score; + } + ORDER_CHECK(id); + return false; + } + + std::vector> literals; // prefix literals + CharReach prefix_cr; // reach of prefix literals + CharReach last_cr; // reach of the last character of literals + CharReach cr; // reach of engine graph + const role_id role; // infix or suffix info + const u32 id; // infix or suffix id + u32 score = ~0U; // score for exclusive analysis +}; + +/** + * \brief add triggering literals to infix info. 
+ */ +bool setTriggerLiteralsInfix(RoleInfo &roleInfo, + const std::map>> &triggers); + +/** + * \brief add triggering literals to suffix info. + */ +bool setTriggerLiteralsSuffix(RoleInfo &roleInfo, + const std::map>> &triggers); + +/** + * Exclusive analysis for infix engines. + * + * @param build rose build info mainly used to set exclusive chunk size here + * @param vertex_map mapping between engine id and rose vertices + * related to this engine + * @param roleInfoSet structure contains role properties including infix info, + * triggering literals and literal reachabilities. + * Used for exclusive analysis. + * @param exclusive_roles output mapping between engine id and its exclusive + * group id + */ +void exclusiveAnalysisInfix(const RoseBuildImpl &build, + const std::map> &vertex_map, + std::set> &roleInfoSet, + std::vector> &exclusive_roles); + +/** + * Exclusive analysis for suffix engines. + * + * @param build rose build info mainly used to set exclusive chunk size here + * @param vertex_map mapping between engine id and rose vertices + * related to this engine + * @param roleInfoSet structure contains role properties including suffix info, + * triggering literals and literal reachabilities. + * Used for exclusive analysis. + * @param exclusive_roles output mapping between engine id and its exclusive + * group id + */ +void exclusiveAnalysisSuffix(const RoseBuildImpl &build, + const std::map> &vertex_map, + std::set> &roleInfoSet, + std::vector> &exclusive_roles); + +} // namespace ue2 + +#endif //ROSE_BUILD_EXCLUSIVE_H + diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp new file mode 100644 index 00000000..5e477e3b --- /dev/null +++ b/src/rose/rose_build_groups.cpp @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose build: code for analysing literal groups. 
+ */ + +#include "rose_build_groups.h" + +#include +#include + +#include +#include +#include + +using namespace std; +using boost::adaptors::map_keys; + +namespace ue2 { + +#define ROSE_LONG_LITERAL_LEN 8 + +static +bool superStrong(const rose_literal_id &lit) { + if (lit.s.length() < ROSE_LONG_LITERAL_LEN) { + return false; + } + + const u32 EXPECTED_FDR_BUCKET_LENGTH = 8; + + assert(lit.s.length() >= EXPECTED_FDR_BUCKET_LENGTH); + size_t len = lit.s.length(); + const string &s = lit.s.get_string(); + + for (size_t i = 1; i < EXPECTED_FDR_BUCKET_LENGTH; i++) { + if (s[len - 1 - i] != s[len - 1]) { + return true; /* we have at least some variation in the tail */ + } + } + DEBUG_PRINTF("lit '%s' is not superstrong due to tail\n", + escapeString(s).c_str()); + return false; +} + +static +bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) { + /* returns true if it or any of its delay versions have root role */ + for (auto v : build.literal_info[id].vertices) { + if (build.isRootSuccessor(v)) { + NGHolder *h = build.g[v].left.graph.get(); + if (!h || proper_out_degree(h->startDs, *h)) { + return true; + } + } + } + + for (u32 delayed_id : build.literal_info[id].delayed_ids) { + for (auto v : build.literal_info[delayed_id].vertices) { + if (build.isRootSuccessor(v)) { + NGHolder *h = build.g[v].left.graph.get(); + if (!h || proper_out_degree(h->startDs, *h)) { + return true; + } + } + } + } + + return false; +} + +static +bool requires_group_assignment(const rose_literal_id &lit, + const rose_literal_info &info) { + if (lit.delay) { /* we will check the shadow's master */ + return false; + } + + if (lit.table == ROSE_ANCHORED || lit.table == ROSE_EVENT) { + return false; + } + + // If we already have a group applied, skip. + if (info.group_mask) { + return false; + } + + if (info.vertices.empty() && info.delayed_ids.empty()) { + DEBUG_PRINTF("literal is good for nothing\n"); + return false; + } + + return true; +} + +static +rose_group calcLocalGroup(const RoseVertex v, const RoseGraph &g, + const deque &literal_info, + const bool small_literal_count) { + rose_group local_group = 0; + + for (auto u : inv_adjacent_vertices_range(v, g)) { + /* In small cases, ensure that siblings have the same rose parentage to + * allow rose squashing. In larger cases, don't do this as groups are + * probably too scarce. 
*/ + for (auto w : adjacent_vertices_range(u, g)) { + if (!small_literal_count || g[v].left == g[w].left) { + for (u32 lit_id : g[w].literals) { + local_group |= literal_info[lit_id].group_mask; + } + } else { + DEBUG_PRINTF("not sibling different mother %zu %zu\n", + g[v].idx, g[w].idx); + } + } + } + + return local_group; +} + +/* group constants */ +#define MAX_LIGHT_LITERAL_CASE 200 /* allow rose to affect group decisions below + * this */ + +static +flat_set getAssociatedVertices(const RoseBuildImpl &build, u32 id) { + flat_set out; + const auto &info = build.literal_info[id]; + insert(&out, info.vertices); + for (const auto &delayed : info.delayed_ids) { + insert(&out, build.literal_info[delayed].vertices); + } + return out; +} + +static +u32 next_available_group(u32 counter, u32 min_start_group) { + counter++; + if (counter == ROSE_GROUPS_MAX) { + DEBUG_PRINTF("resetting groups\n"); + counter = min_start_group; + } + + return counter; +} + +void assignGroupsToLiterals(RoseBuildImpl &build) { + auto &literals = build.literals; + auto &literal_info = build.literal_info; + + bool small_literal_count = literal_info.size() <= MAX_LIGHT_LITERAL_CASE; + + map groupCount; /* group index to number of members */ + + u32 counter = 0; + u32 group_always_on = 0; + + // First pass: handle always on literals. + for (const auto &e : literals.right) { + u32 id = e.first; + const rose_literal_id &lit = e.second; + rose_literal_info &info = literal_info[id]; + + if (!requires_group_assignment(lit, info)) { + continue; + } + + // If this literal has a root role, we always have to search for it + // anyway, so it goes in the always-on group. + /* We could end up squashing it if it is followed by a .* */ + if (eligibleForAlwaysOnGroup(build, id)) { + info.group_mask = 1ULL << group_always_on; + groupCount[group_always_on]++; + continue; + } + } + + u32 group_long_lit; + if (groupCount[group_always_on]) { + DEBUG_PRINTF("%u always on literals\n", groupCount[group_always_on]); + group_long_lit = group_always_on; + counter++; + } else { + group_long_lit = counter; + counter++; + } + + u32 min_start_group = counter; + priority_queue> pq; + + // Second pass: the other literals. 
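
Group bits below min_start_group stay reserved for the always-on (and long-literal) groups, so the round-robin counter driven by next_available_group() wraps back to min_start_group rather than to zero. A minimal standalone sketch of that wrap-around follows; kRoseGroupsMax is assumed to be 64 (one bit per group in the 64-bit group mask) and the helper names are invented for illustration.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Assumed value: one bit per group in the 64-bit group mask.
    constexpr uint32_t kRoseGroupsMax = 64;

    // Advance the round-robin counter, wrapping back to the first
    // non-reserved group rather than to zero.
    uint32_t nextAvailableGroup(uint32_t counter, uint32_t min_start_group) {
        counter++;
        if (counter == kRoseGroupsMax) {
            counter = min_start_group;
        }
        return counter;
    }

    // Hand out one group bit per literal, cycling over the non-reserved
    // groups.
    std::vector<uint64_t> assignRoundRobin(std::size_t num_literals,
                                           uint32_t min_start_group) {
        std::vector<uint64_t> masks(num_literals);
        uint32_t counter = min_start_group;
        for (std::size_t i = 0; i < num_literals; i++) {
            masks[i] = 1ULL << counter;
            counter = nextAvailableGroup(counter, min_start_group);
        }
        return masks;
    }
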
+ for (const auto &e : literals.right) { + u32 id = e.first; + const rose_literal_id &lit = e.second; + rose_literal_info &info = literal_info[id]; + + if (!requires_group_assignment(lit, info)) { + continue; + } + + assert(!eligibleForAlwaysOnGroup(build, id)); + pq.emplace(-(s32)info.vertices.size(), -(s32)lit.s.length(), id); + } + vector long_lits; + while (!pq.empty()) { + u32 id = get<2>(pq.top()); + pq.pop(); + UNUSED const rose_literal_id &lit = literals.right.at(id); + DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id, + literal_info[id].vertices.size(), lit.s.length()); + + u8 group_id = 0; + rose_group group = ~0ULL; + for (auto v : getAssociatedVertices(build, id)) { + rose_group local_group = calcLocalGroup(v, build.g, literal_info, + small_literal_count); + group &= local_group; + if (!group) { + break; + } + } + + if (group == ~0ULL) { + goto boring; + } + + group &= ~((1ULL << min_start_group) - 1); /* ensure the purity of the + * always_on groups */ + if (!group) { + goto boring; + } + + group_id = ctz64(group); + + /* TODO: fairness */ + DEBUG_PRINTF("picking sibling group %hhd\n", group_id); + literal_info[id].group_mask = 1ULL << group_id; + groupCount[group_id]++; + + continue; + + boring: + /* long literals will either be stuck in a mega group or spread around + * depending on availability */ + if (superStrong(lit)) { + long_lits.push_back(id); + continue; + } + + // Other literals are assigned to our remaining groups round-robin. + group_id = counter; + + DEBUG_PRINTF("picking boring group %hhd\n", group_id); + literal_info[id].group_mask = 1ULL << group_id; + groupCount[group_id]++; + counter = next_available_group(counter, min_start_group); + } + + /* spread long literals out amongst unused groups if any, otherwise stick + * them in the always on the group */ + + if (groupCount[counter]) { + DEBUG_PRINTF("sticking long literals in the image of the always on\n"); + for (u32 lit_id : long_lits) { + literal_info[lit_id].group_mask = 1ULL << group_long_lit; + groupCount[group_long_lit]++; + } + } else { + u32 min_long_counter = counter; + DEBUG_PRINTF("base long lit group = %u\n", min_long_counter); + for (u32 lit_id : long_lits) { + u8 group_id = counter; + literal_info[lit_id].group_mask = 1ULL << group_id; + groupCount[group_id]++; + counter = next_available_group(counter, min_long_counter); + } + } + /* assign delayed literals to the same group as their parent */ + for (const auto &e : literals.right) { + u32 id = e.first; + const rose_literal_id &lit = e.second; + + if (!lit.delay) { + continue; + } + + u32 parent = literal_info[id].undelayed_id; + DEBUG_PRINTF("%u is shadow picking up groups from %u\n", id, parent); + assert(literal_info[parent].undelayed_id == parent); + assert(literal_info[parent].group_mask); + literal_info[id].group_mask = literal_info[parent].group_mask; + /* don't increment the group count - these don't really exist */ + } + + DEBUG_PRINTF("populate group to literal mapping\n"); + for (const u32 id : literals.right | map_keys) { + rose_group groups = literal_info[id].group_mask; + while (groups) { + u32 group_id = findAndClearLSB_64(&groups); + build.group_to_literal[group_id].insert(id); + } + } + + /* find how many groups we allocated */ + for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) { + if (groupCount[i]) { + build.group_end = max(build.group_end, i + 1); + } + } +} + +rose_group RoseBuildImpl::getGroups(RoseVertex v) const { + rose_group groups = 0; + + for (u32 id : g[v].literals) { + u32 lit_id = 
literal_info.at(id).undelayed_id; + + rose_group mygroups = literal_info[lit_id].group_mask; + groups |= mygroups; + } + + return groups; +} + +/** \brief Get the groups of the successor literals of a given vertex. */ +rose_group RoseBuildImpl::getSuccGroups(RoseVertex start) const { + rose_group initialGroups = 0; + + for (auto v : adjacent_vertices_range(start, g)) { + initialGroups |= getGroups(v); + } + + return initialGroups; +} + +/** + * The groups that a role sets are determined by the union of its successor + * literals. Requires the literals already have had groups assigned. + */ +void assignGroupsToRoles(RoseBuildImpl &build) { + auto &g = build.g; + + /* Note: if there is a succ literal in the sidematcher, its successors + * literals must be added instead */ + for (auto v : vertices_range(g)) { + if (build.isAnyStart(v)) { + continue; + } + + const rose_group succ_groups = build.getSuccGroups(v); + g[v].groups |= succ_groups; + + auto ghost_it = build.ghost.find(v); + if (ghost_it != end(build.ghost)) { + /* delayed roles need to supply their groups to the ghost role */ + g[ghost_it->second].groups |= succ_groups; + } + + DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups); + } +} + +/** + * \brief Returns a mapping from each graph vertex v to the intersection of the + * groups switched on by all of the paths leading up to (and including) v from + * the start vertexes. + */ +unordered_map +getVertexGroupMap(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + vector v_order; + v_order.reserve(num_vertices(g)); + + boost::topological_sort(g, back_inserter(v_order), + vertex_index_map(get(&RoseVertexProps::idx, g))); + + unordered_map vertex_group_map; + vertex_group_map.reserve(num_vertices(g)); + + const rose_group initial_groups = build.getInitialGroups(); + + for (const auto &v : boost::adaptors::reverse(v_order)) { + DEBUG_PRINTF("vertex %zu\n", g[v].idx); + + if (build.isAnyStart(v)) { + DEBUG_PRINTF("start vertex, groups=0x%llx\n", initial_groups); + vertex_group_map.emplace(v, initial_groups); + continue; + } + + // To get to this vertex, we must have come through a predecessor, and + // everyone who isn't a start vertex has one. + assert(in_degree(v, g) > 0); + rose_group pred_groups = ~rose_group{0}; + for (auto u : inv_adjacent_vertices_range(v, g)) { + DEBUG_PRINTF("pred %zu\n", g[u].idx); + assert(contains(vertex_group_map, u)); + pred_groups &= vertex_group_map.at(u); + } + + DEBUG_PRINTF("pred_groups=0x%llx\n", pred_groups); + DEBUG_PRINTF("g[v].groups=0x%llx\n", g[v].groups); + + rose_group v_groups = pred_groups | g[v].groups; + DEBUG_PRINTF("v_groups=0x%llx\n", v_groups); + + vertex_group_map.emplace(v, v_groups); + } + + return vertex_group_map; +} + +/** + * \brief Find the set of groups that can be squashed anywhere in the graph, + * either by a literal or by a leftfix. + */ +rose_group getSquashableGroups(const RoseBuildImpl &build) { + rose_group squashable_groups = 0; + for (const auto &info : build.literal_info) { + if (info.squash_group) { + DEBUG_PRINTF("lit squash mask 0x%llx\n", info.group_mask); + squashable_groups |= info.group_mask; + } + } + for (const auto &m : build.rose_squash_masks) { + DEBUG_PRINTF("left squash mask 0x%llx\n", ~m.second); + squashable_groups |= ~m.second; + } + + DEBUG_PRINTF("squashable groups=0x%llx\n", squashable_groups); + return squashable_groups; +} + +/** + * \brief True if every vertex associated with a group also belongs to + * lit_info. 
+ */ +static +bool coversGroup(const RoseBuildImpl &build, + const rose_literal_info &lit_info) { + if (lit_info.vertices.empty()) { + DEBUG_PRINTF("no vertices - does not cover\n"); + return false; + } + + if (!lit_info.group_mask) { + DEBUG_PRINTF("no group - does not cover\n"); + return false; /* no group (not a floating lit?) */ + } + + assert(popcount64(lit_info.group_mask) == 1); + + /* for each lit in group, ensure that vertices are a subset of lit_info's */ + rose_group groups = lit_info.group_mask; + while (groups) { + u32 group_id = findAndClearLSB_64(&groups); + for (u32 id : build.group_to_literal.at(group_id)) { + DEBUG_PRINTF(" checking against friend %u\n", id); + if (!is_subset_of(build.literal_info[id].vertices, + lit_info.vertices)) { + DEBUG_PRINTF("fail\n"); + return false; + } + } + } + + DEBUG_PRINTF("ok\n"); + return true; +} + +static +bool isGroupSquasher(const RoseBuildImpl &build, const u32 id /* literal id */, + rose_group forbidden_squash_group) { + const RoseGraph &g = build.g; + + const rose_literal_info &lit_info = build.literal_info.at(id); + + DEBUG_PRINTF("checking if %u '%s' is a group squasher %016llx\n", id, + dumpString(build.literals.right.at(id).s).c_str(), + lit_info.group_mask); + + if (build.literals.right.at(id).table == ROSE_EVENT) { + DEBUG_PRINTF("event literal, has no groups to squash\n"); + return false; + } + + if (!coversGroup(build, lit_info)) { + DEBUG_PRINTF("does not cover group\n"); + return false; + } + + if (lit_info.group_mask & forbidden_squash_group) { + /* probably a delayed lit */ + DEBUG_PRINTF("skipping as involves a forbidden group\n"); + return false; + } + + // Single-vertex, less constrained case than the multiple-vertex one below. + if (lit_info.vertices.size() == 1) { + const RoseVertex &v = *lit_info.vertices.begin(); + + if (build.hasDelayPred(v)) { /* due to rebuild issues */ + return false; + } + + /* there are two ways to be a group squasher: + * 1) only care about the first accepted match + * 2) can only match once after a pred match + * + * (2) requires analysis of the infix before v and is not implemented, + * TODO + */ + + /* Case 1 */ + + // Can't squash cases with accepts + if (!g[v].reports.empty()) { + return false; + } + + /* Can't squash cases with a suffix without analysis of the suffix. + * TODO: look at suffixes */ + if (g[v].suffix) { + return false; + } + + // Out-edges must have inf max bound, + no other shenanigans */ + for (const auto &e : out_edges_range(v, g)) { + if (g[e].maxBound != ROSE_BOUND_INF) { + return false; + } + + if (g[target(e, g)].left) { + return false; /* is an infix rose trigger, TODO: analysis */ + } + } + + DEBUG_PRINTF("%u is a path 1 group squasher\n", id); + return true; + + /* note: we could also squash the groups of its preds (if nobody else is + * using them. TODO. */ + } + + // Multiple-vertex case + for (auto v : lit_info.vertices) { + assert(!build.isAnyStart(v)); + + // Can't squash cases with accepts + if (!g[v].reports.empty()) { + return false; + } + + // Suffixes and leftfixes are out too as first literal may not match + // for everyone. + if (!g[v].isBoring()) { + return false; + } + + /* TODO: checks are solid but we should explain */ + if (build.hasDelayPred(v) || build.hasAnchoredTablePred(v)) { + return false; + } + + // Out-edges must have inf max bound and not directly lead to another + // vertex with this group, e.g. 'foobar.*foobar'. 
+ for (const auto &e : out_edges_range(v, g)) { + if (g[e].maxBound != ROSE_BOUND_INF) { + return false; + } + RoseVertex t = target(e, g); + + if (g[t].left) { + return false; /* is an infix rose trigger */ + } + + for (u32 lit_id : g[t].literals) { + if (build.literal_info[lit_id].group_mask & + lit_info.group_mask) { + return false; + } + } + } + + // In-edges must all be dot-stars with no overlap at all, as overlap + // also causes history to be used. + /* Different tables are already forbidden by previous checks */ + for (const auto &e : in_edges_range(v, g)) { + if (!(g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF)) { + return false; + } + + // Check overlap, if source was a literal. + RoseVertex u = source(e, g); + if (build.maxLiteralOverlap(u, v)) { + return false; + } + } + } + + DEBUG_PRINTF("literal %u is a multi-vertex group squasher\n", id); + return true; +} + +void findGroupSquashers(RoseBuildImpl &build) { + rose_group forbidden_squash_group = 0; + for (const auto &e : build.literals.right) { + if (e.second.delay) { + forbidden_squash_group |= build.literal_info[e.first].group_mask; + } + } + + for (u32 id = 0; id < build.literal_info.size(); id++) { + if (isGroupSquasher(build, id, forbidden_squash_group)) { + build.literal_info[id].squash_group = true; + } + } +} + +} // namespace ue2 diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h new file mode 100644 index 00000000..3ab5eb78 --- /dev/null +++ b/src/rose/rose_build_groups.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose build: code for analysing literal groups. 
+ */ + +#ifndef ROSE_BUILD_GROUPS_H +#define ROSE_BUILD_GROUPS_H + +#include "rose_build_impl.h" +#include "util/ue2_containers.h" + +namespace ue2 { + +unordered_map +getVertexGroupMap(const RoseBuildImpl &build); + +rose_group getSquashableGroups(const RoseBuildImpl &build); + +void assignGroupsToLiterals(RoseBuildImpl &build); + +void assignGroupsToRoles(RoseBuildImpl &build); + +void findGroupSquashers(RoseBuildImpl &build); + +} // namespace ue2 + +#endif // ROSE_BUILD_GROUPS_H + diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 4122e0bd..d239a698 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -60,17 +60,19 @@ struct BoundaryReports; struct CastleProto; struct CompileContext; class ReportManager; +class SmallWriteBuild; class SomSlotManager; struct suffix_id { suffix_id(const RoseSuffixInfo &in) : g(in.graph.get()), c(in.castle.get()), d(in.rdfa.get()), - h(in.haig.get()), dfa_min_width(in.dfa_min_width), + h(in.haig.get()), t(in.tamarama.get()), + dfa_min_width(in.dfa_min_width), dfa_max_width(in.dfa_max_width) { assert(!g || g->kind == NFA_SUFFIX); } bool operator==(const suffix_id &b) const { - bool rv = g == b.g && c == b.c && h == b.h && d == b.d; + bool rv = g == b.g && c == b.c && h == b.h && d == b.d && t == b.t; assert(!rv || dfa_min_width == b.dfa_min_width); assert(!rv || dfa_max_width == b.dfa_max_width); return rv; @@ -82,6 +84,7 @@ struct suffix_id { ORDER_CHECK(c); ORDER_CHECK(d); ORDER_CHECK(h); + ORDER_CHECK(t); return false; } @@ -113,6 +116,22 @@ struct suffix_id { } return c; } + TamaProto *tamarama() { + if (!d && !h) { + assert(dfa_min_width == depth(0)); + assert(dfa_max_width == depth::infinity()); + } + return t; + } + const TamaProto *tamarama() const { + if (!d && !h) { + assert(dfa_min_width == depth(0)); + assert(dfa_max_width == depth::infinity()); + } + return t; + } + + raw_som_dfa *haig() { return h; } const raw_som_dfa *haig() const { return h; } raw_dfa *dfa() { return d; } @@ -125,6 +144,7 @@ private: CastleProto *c; raw_dfa *d; raw_som_dfa *h; + TamaProto *t; depth dfa_min_width; depth dfa_max_width; @@ -150,7 +170,7 @@ struct left_id { : g(in.graph.get()), c(in.castle.get()), d(in.dfa.get()), h(in.haig.get()), dfa_min_width(in.dfa_min_width), dfa_max_width(in.dfa_max_width) { - assert(!g || !generates_callbacks(*g)); + assert(!g || !has_managed_reports(*g)); } bool operator==(const left_id &b) const { bool rv = g == b.g && c == b.c && h == b.h && d == b.d; @@ -257,6 +277,17 @@ struct rose_literal_id { u32 distinctiveness; size_t elength(void) const { return s.length() + delay; } + size_t elength_including_mask(void) const { + size_t mask_len = msk.size(); + for (u8 c : msk) { + if (!c) { + mask_len--; + } else { + break; + } + } + return MAX(mask_len, s.length()) + delay; + } }; static inline @@ -307,7 +338,7 @@ struct OutfixInfo { template explicit OutfixInfo(std::unique_ptr x) : proto(std::move(x)) {} - explicit OutfixInfo(MpvProto mpv) : proto(std::move(mpv)) {} + explicit OutfixInfo(MpvProto mpv_in) : proto(std::move(mpv_in)) {} u32 get_queue(QueueIndexFactory &qif); @@ -317,14 +348,14 @@ struct OutfixInfo { } bool is_nonempty_mpv() const { - auto *mpv = boost::get(&proto); - return mpv && !mpv->empty(); + auto *m = boost::get(&proto); + return m && !m->empty(); } bool is_dead() const { - auto *mpv = boost::get(&proto); - if (mpv) { - return mpv->empty(); + auto *m = boost::get(&proto); + if (m) { + return m->empty(); } return boost::get(&proto) != nullptr; } @@ -396,7 +427,7 @@ std::set 
all_reports(const OutfixInfo &outfix); // Concrete impl class class RoseBuildImpl : public RoseBuild { public: - RoseBuildImpl(ReportManager &rm, SomSlotManager &ssm, + RoseBuildImpl(ReportManager &rm, SomSlotManager &ssm, SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary); ~RoseBuildImpl() override; @@ -439,10 +470,6 @@ public: // Find the maximum bound on the edges to this vertex's successors. u32 calcSuccMaxBound(RoseVertex u) const; - // Assign roles to groups, writing the groups bitset into each role in the - // graph. - void assignGroupsToRoles(); - /* Returns the ID of the given literal in the literal map, adding it if * necessary. */ u32 getLiteralId(const ue2_literal &s, u32 delay, rose_literal_table table); @@ -474,8 +501,6 @@ public: bool hasLiteralInTable(RoseVertex v, enum rose_literal_table t) const; bool hasAnchoredTablePred(RoseVertex v) const; - void assignGroupsToLiterals(void); - // Is the given vertex a successor of either root or anchored_root? bool isRootSuccessor(const RoseVertex &v) const; /* Is the given vertex a successor of something other than root or @@ -534,13 +559,18 @@ public: std::map>> anchored_nfas; std::map> anchored_simple; std::map > group_to_literal; - u32 group_weak_end; u32 group_end; u32 anchored_base_id; u32 ematcher_region_size; /**< number of bytes the eod table runs over */ + /** \brief Mapping from leftfix to queue ID (used in dump code). */ + unordered_map leftfix_queue_map; + + /** \brief Mapping from suffix to queue ID (used in dump code). */ + unordered_map suffix_queue_map; + /** \brief Mapping from anchored literal ID to the original literal suffix * present when the literal was added to the literal matcher. Used for * overlap calculation in history assignment. */ @@ -566,6 +596,7 @@ public: QueueIndexFactory qif; ReportManager &rm; SomSlotManager &ssm; + SmallWriteBuild &smwr; const BoundaryReports &boundary; private: diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index 54c01e08..ba77b402 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -538,6 +538,36 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, } } +static +bool hasSingleFloatingStart(const NGHolder &g) { + NFAVertex initial = NGHolder::null_vertex(); + for (auto v : adjacent_vertices_range(g.startDs, g)) { + if (v == g.startDs) { + continue; + } + if (initial != NGHolder::null_vertex()) { + DEBUG_PRINTF("more than one start\n"); + return false; + } + initial = v; + } + + if (initial == NGHolder::null_vertex()) { + DEBUG_PRINTF("no floating starts\n"); + return false; + } + + // Anchored start must have no successors other than startDs and initial. + for (auto v : adjacent_vertices_range(g.start, g)) { + if (v != initial && v != g.startDs) { + DEBUG_PRINTF("anchored start\n"); + return false; + } + } + + return true; +} + static bool getTransientPrefixReach(const NGHolder &g, u32 lag, map &look) { @@ -546,15 +576,9 @@ bool getTransientPrefixReach(const NGHolder &g, u32 lag, return false; } - // Currently we don't handle anchored prefixes, as we would need to be able - // to represent the bounds from the anchor as well. 
- if (out_degree(g.start, g) != 1) { - DEBUG_PRINTF("anchored\n"); - return false; - } - - if (out_degree(g.startDs, g) != 2) { - DEBUG_PRINTF("more than one start\n"); + // Must be a floating chain wired to startDs. + if (!hasSingleFloatingStart(g)) { + DEBUG_PRINTF("not a single floating start\n"); return false; } @@ -569,12 +593,28 @@ bool getTransientPrefixReach(const NGHolder &g, u32 lag, look[0 - i] = g[v].char_reach; - if (in_degree(v, g) != 1) { + NFAVertex next = NGHolder::null_vertex(); + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start) { + continue; // Benign, checked by hasSingleFloatingStart + } + if (next == NGHolder::null_vertex()) { + next = u; + continue; + } DEBUG_PRINTF("branch\n"); return false; } - v = *(inv_adjacent_vertices(v, g).first); + if (next == NGHolder::null_vertex() || next == v) { + DEBUG_PRINTF("no predecessor or only self-loop\n"); + // This graph is malformed -- all vertices in a graph that makes it + // to this analysis should have predecessors. + assert(0); + return false; + } + + v = next; i++; } @@ -644,6 +684,10 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, lookaround.reserve(look.size()); for (const auto &m : look) { + if (m.first < -128 || m.first > 127) { + DEBUG_PRINTF("range too big\n"); + return false; + } s8 offset = verify_s8(m.first); lookaround.emplace_back(offset, m.second); } diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 83c49556..2eb70f60 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -38,12 +38,14 @@ #include "hwlm/hwlm_build.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" +#include "nfa/nfa_api_queue.h" #include "util/charreach_util.h" #include "util/compile_context.h" #include "util/compile_error.h" #include "util/dump_charclass.h" #include "util/report.h" #include "util/report_manager.h" +#include "util/verify_types.h" #include "ue2common.h" #include @@ -333,6 +335,80 @@ bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id, return true; } +void findMoreLiteralMasks(RoseBuildImpl &build) { + if (!build.cc.grey.roseHamsterMasks) { + return; + } + + vector candidates; + for (const auto &e : build.literals.right) { + const u32 id = e.first; + const auto &lit = e.second; + + // This pass takes place before final IDs are assigned to literals. + assert(!build.hasFinalId(id)); + + if (lit.delay || build.isDelayed(id)) { + continue; + } + + // Literal masks are only allowed for literals that will end up in an + // HWLM table. + switch (lit.table) { + case ROSE_FLOATING: + case ROSE_EOD_ANCHORED: + case ROSE_ANCHORED_SMALL_BLOCK: + break; + default: + continue; + } + + if (!lit.msk.empty()) { + continue; + } + + const auto &lit_info = build.literal_info.at(id); + if (lit_info.requires_benefits) { + continue; + } + candidates.push_back(id); + } + + for (const u32 &id : candidates) { + const auto &lit = build.literals.right.at(id); + auto &lit_info = build.literal_info.at(id); + + vector msk, cmp; + if (!findHamsterMask(build, lit, lit_info, msk, cmp)) { + continue; + } + assert(!msk.empty()); + DEBUG_PRINTF("found advisory mask for lit_id=%u (%s)\n", id, + dumpString(lit.s).c_str()); + u32 new_id = build.getLiteralId(lit.s, msk, cmp, lit.delay, lit.table); + assert(new_id != id); + DEBUG_PRINTF("replacing with new lit_id=%u\n", new_id); + + // Note that our new literal may already exist and have vertices, etc. 
+ // We assume that this transform is happening prior to group assignment. + assert(lit_info.group_mask == 0); + auto &new_info = build.literal_info.at(new_id); + + // Move the vertices across. + new_info.vertices.insert(begin(lit_info.vertices), + end(lit_info.vertices)); + for (auto v : lit_info.vertices) { + build.g[v].literals.erase(id); + build.g[v].literals.insert(new_id); + } + lit_info.vertices.clear(); + + // Preserve other properties. + new_info.requires_explode = lit_info.requires_explode; + new_info.requires_benefits = lit_info.requires_benefits; + } +} + static bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, const rose_literal_info &info) { @@ -340,8 +416,8 @@ bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, return false; } - auto is_simple_exhaustible = [&build](ReportID id) { - const Report &report = build.rm.getReport(id); + auto is_simple_exhaustible = [&build](ReportID rid) { + const Report &report = build.rm.getReport(rid); return isSimpleExhaustible(report); }; @@ -359,7 +435,7 @@ bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, // Called by isNoRunsLiteral below. static -bool isNoRunsVertex(const RoseBuildImpl &build, NFAVertex u) { +bool isNoRunsVertex(const RoseBuildImpl &build, RoseVertex u) { const RoseGraph &g = build.g; if (!g[u].isBoring()) { DEBUG_PRINTF("u=%zu is not boring\n", g[u].idx); @@ -445,8 +521,111 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return true; } +static +const raw_puff &getChainedPuff(const RoseBuildImpl &build, + const Report &report) { + DEBUG_PRINTF("chained report, event %u\n", report.onmatch); + + // MPV has already been moved to the outfixes vector. + assert(!build.mpv_outfix); + + auto mpv_outfix_it = find_if( + begin(build.outfixes), end(build.outfixes), + [](const OutfixInfo &outfix) { return outfix.is_nonempty_mpv(); }); + assert(mpv_outfix_it != end(build.outfixes)); + const auto *mpv = mpv_outfix_it->mpv(); + + u32 puff_index = report.onmatch - MQE_TOP_FIRST; + assert(puff_index < mpv->triggered_puffettes.size()); + return mpv->triggered_puffettes.at(puff_index); +} + +/** + * \brief Returns a conservative estimate of the minimum offset at which the + * given literal can lead to a report. + * + * TODO: This could be made more precise by calculating a "distance to accept" + * for every vertex in the graph; right now we're only accurate for leaf nodes. + */ +static +u64a literalMinReportOffset(const RoseBuildImpl &build, + const rose_literal_id &lit, + const rose_literal_info &info) { + const auto &g = build.g; + + const u32 lit_len = verify_u32(lit.elength()); + + u64a lit_min_offset = UINT64_MAX; + + for (const auto &v : info.vertices) { + DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].idx, g[v].min_offset); + + u64a vert_offset = g[v].min_offset; + + if (vert_offset >= lit_min_offset) { + continue; + } + + u64a min_offset = UINT64_MAX; + + for (const auto &id : g[v].reports) { + const Report &report = build.rm.getReport(id); + DEBUG_PRINTF("report id %u, min offset=%llu\n", id, + report.minOffset); + if (report.type == INTERNAL_ROSE_CHAIN) { + // This vertex triggers an MPV, which will fire reports after + // repeating for a while. + assert(report.minOffset == 0); // Should not have bounds. 
+ const auto &puff = getChainedPuff(build, report); + DEBUG_PRINTF("chained puff repeats=%u\n", puff.repeats); + const Report &puff_report = build.rm.getReport(puff.report); + DEBUG_PRINTF("puff report %u, min offset=%llu\n", puff.report, + puff_report.minOffset); + min_offset = min(min_offset, max(vert_offset + puff.repeats, + puff_report.minOffset)); + } else { + DEBUG_PRINTF("report min offset=%llu\n", report.minOffset); + min_offset = min(min_offset, max(vert_offset, + report.minOffset)); + } + } + + if (g[v].suffix) { + depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top); + assert(suffix_width.is_reachable()); + DEBUG_PRINTF("suffix with width %s\n", suffix_width.str().c_str()); + min_offset = min(min_offset, vert_offset + suffix_width); + } + + if (!isLeafNode(v, g) || min_offset == UINT64_MAX) { + min_offset = vert_offset; + } + + lit_min_offset = min(lit_min_offset, min_offset); + } + + // If this literal is the undelayed literal corresponding to some delayed + // literals, we must take their minimum offsets into account. + for (const u32 &delayed_id : info.delayed_ids) { + const auto &delayed_lit = build.literals.right.at(delayed_id); + const auto &delayed_info = build.literal_info.at(delayed_id); + u64a delayed_min_offset = literalMinReportOffset(build, delayed_lit, + delayed_info); + DEBUG_PRINTF("delayed_id=%u, min_offset = %llu\n", delayed_id, + delayed_min_offset); + lit_min_offset = min(lit_min_offset, delayed_min_offset); + } + + // If we share a vertex with a shorter literal, our min offset might dip + // below the length of this one. + lit_min_offset = max(lit_min_offset, u64a{lit_len}); + + return lit_min_offset; +} + vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table) { + rose_literal_table table, + u32 max_offset) { vector lits; for (const auto &e : build.literals.right) { @@ -472,33 +651,40 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str()); - vector msk = e.second.msk; // copy - vector cmp = e.second.cmp; // copy - - if (msk.empty()) { - // Try and pick up an advisory mask.
- if (!findHamsterMask(build, e.second, info, msk, cmp)) { - msk.clear(); cmp.clear(); - } else { - DEBUG_PRINTF("picked up late mask %zu\n", msk.size()); + if (max_offset != ROSE_BOUND_INF) { + u64a min_report = literalMinReportOffset(build, e.second, info); + if (min_report > max_offset) { + DEBUG_PRINTF("min report offset=%llu exceeds max_offset=%u\n", + min_report, max_offset); + continue; } } + const vector &msk = e.second.msk; + const vector &cmp = e.second.cmp; + bool noruns = isNoRunsLiteral(build, id, info); if (info.requires_explode) { DEBUG_PRINTF("exploding lit\n"); - const vector empty_msk; // msk/cmp will be empty case_iter cit = caseIterateBegin(lit); case_iter cite = caseIterateEnd(); for (; cit != cite; ++cit) { + string s = *cit; + bool nocase = false; + DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d msk=%s, " "cmp=%s (exploded)\n", - final_id, escapeString(lit.get_string()).c_str(), - 0, noruns, dumpMask(msk).c_str(), - dumpMask(cmp).c_str()); - lits.emplace_back(*cit, false, noruns, final_id, groups, - empty_msk, empty_msk); + final_id, escapeString(s).c_str(), nocase, noruns, + dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + + if (!maskIsConsistent(s, nocase, msk, cmp)) { + DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + continue; + } + + lits.emplace_back(move(s), nocase, noruns, final_id, groups, + msk, cmp); } } else { const std::string &s = lit.get_string(); @@ -514,8 +700,7 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, continue; } - lits.emplace_back(lit.get_string(), lit.any_nocase(), noruns, - final_id, groups, msk, cmp); + lits.emplace_back(s, nocase, noruns, final_id, groups, msk, cmp); } } @@ -523,10 +708,12 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, } aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + rose_group *fgroups, size_t *fsize, size_t *historyRequired, size_t *streamStateRequired) { *fsize = 0; + *fgroups = 0; auto fl = fillHamsterLiteralList(build, ROSE_FLOATING); if (fl.empty()) { @@ -534,6 +721,10 @@ aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, return nullptr; } + for (const hwlmLiteral &hlit : fl) { + *fgroups |= hlit.groups; + } + hwlmStreamingControl ctl; hwlmStreamingControl *ctlp; if (build.cc.streaming) { @@ -587,7 +778,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto lits = fillHamsterLiteralList(build, ROSE_FLOATING); + auto lits = fillHamsterLiteralList(build, ROSE_FLOATING, + ROSE_SMALL_BLOCK_LEN); if (lits.empty()) { DEBUG_PRINTF("no floating table\n"); return nullptr; @@ -596,8 +788,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto anchored_lits = - fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); + auto anchored_lits = fillHamsterLiteralList(build, + ROSE_ANCHORED_SMALL_BLOCK, ROSE_SMALL_BLOCK_LEN); if (anchored_lits.empty()) { DEBUG_PRINTF("no small-block anchored literals\n"); return nullptr; @@ -605,15 +797,10 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end()); - // Remove literals that are longer than our small block length, as they can - // never match. TODO: improve by removing literals that have a min match - // offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases - // with preceding dots that put them over the limit. 
- auto longer_than_limit = [](const hwlmLiteral &lit) { - return lit.s.length() > ROSE_SMALL_BLOCK_LEN; - }; - lits.erase(remove_if(lits.begin(), lits.end(), longer_than_limit), - lits.end()); + // None of our literals should be longer than the small block limit. + assert(all_of(begin(lits), end(lits), [](const hwlmLiteral &lit) { + return lit.s.length() <= ROSE_SMALL_BLOCK_LEN; + })); if (lits.empty()) { DEBUG_PRINTF("no literals shorter than small block len\n"); diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 9781f514..2a225bf5 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -44,10 +44,17 @@ namespace ue2 { struct hwlmLiteral; +/** + * \brief Build up a vector of literals for the given table. + * + * If max_offset is specified (and not ROSE_BOUND_INF), then literals that can + * only lead to a pattern match after max_offset may be excluded. + */ std::vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table); + rose_literal_table table, u32 max_offset = ROSE_BOUND_INF); aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + rose_group *fgroups, size_t *fsize, size_t *historyRequired, size_t *streamStateRequired); @@ -58,6 +65,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, aligned_unique_ptr buildEodAnchoredMatcher(const RoseBuildImpl &build, size_t *esize); +void findMoreLiteralMasks(RoseBuildImpl &build); + } // namespace ue2 #endif // ROSE_BUILD_MATCHERS_H diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index a10bc86e..759e0dbe 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -338,7 +338,7 @@ void findUncalcLeavesCandidates(RoseBuildImpl &tbi, deque &ordered) { const RoseGraph &g = tbi.g; - vector suffix_vertices; // vertices with suffix graphs + vector suffix_vertices; // vertices with suffix graphs ue2::unordered_map fcount; // ref count per graph for (auto v : vertices_range(g)) { diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index b16e3a69..c2f9f580 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -34,6 +34,7 @@ #include "nfa/mcclellancompile_util.h" #include "nfa/nfa_api.h" #include "nfa/rdfa.h" +#include "nfa/tamaramacompile.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_limex.h" #include "nfagraph/ng_reports.h" @@ -66,7 +67,9 @@ namespace ue2 { // just to get it out of the header RoseBuild::~RoseBuild() { } -RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, +RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, + SomSlotManager &ssm_in, + SmallWriteBuild &smwr_in, const CompileContext &cc_in, const BoundaryReports &boundary_in) : cc(cc_in), @@ -75,7 +78,6 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, vertexIndex(0), delay_base_id(MO_INVALID_IDX), hasSom(false), - group_weak_end(0), group_end(0), anchored_base_id(MO_INVALID_IDX), ematcher_region_size(0), @@ -83,6 +85,7 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, max_rose_anchored_floating_overlap(0), rm(rm_in), ssm(ssm_in), + smwr(smwr_in), boundary(boundary_in), next_nfa_report(0) { // add root vertices to graph @@ -233,10 +236,12 @@ size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { } // RoseBuild factory -unique_ptr makeRoseBuilder(ReportManager &rm, SomSlotManager &ssm, +unique_ptr makeRoseBuilder(ReportManager &rm, + SomSlotManager &ssm, + SmallWriteBuild &smwr, const 
CompileContext &cc, const BoundaryReports &boundary) { - return ue2::make_unique(rm, ssm, cc, boundary); + return ue2::make_unique(rm, ssm, smwr, cc, boundary); } size_t roseSize(const RoseEngine *t) { @@ -538,11 +543,11 @@ static bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, const Grey &grey) { /* TODO: tighten */ - NFAVertex seen_vert = NFAGraph::null_vertex(); + NFAVertex seen_vert = NGHolder::null_vertex(); for (auto v : inv_adjacent_vertices_range(h.accept, h)) { if (has_intersection(h[v].reports, reports)) { - if (seen_vert != NFAGraph::null_vertex()) { + if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; @@ -551,7 +556,7 @@ bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (has_intersection(h[v].reports, reports)) { - if (seen_vert != NFAGraph::null_vertex()) { + if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; @@ -581,8 +586,12 @@ public: bool requiresDedupeSupport( const ue2::flat_set &reports) const override; +private: + bool hasSafeMultiReports(const ue2::flat_set &reports) const; + const RoseBuildImpl &tbi; - map> vert_map; + map> vert_map; //!< ordinary literals + map> sb_vert_map; //!< small block literals map> suffix_map; map> outfix_map; map> puff_map; @@ -602,10 +611,14 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) set suffixes; for (auto v : vertices_range(g)) { - // Literals in the small block table don't count as dupes: although - // they have copies in the anchored table, the two are never run in the - // same runtime invocation. All other literals count, though. - if (!tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) { + // Literals in the small block table are "shadow" copies of literals in + // the other tables that do not run in the same runtime invocation. + // Dedupe key assignment will be taken care of by the real literals. + if (tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) { + for (const auto &report_id : g[v].reports) { + sb_vert_map[report_id].insert(v); + } + } else { for (const auto &report_id : g[v].reports) { vert_map[report_id].insert(v); } @@ -673,19 +686,54 @@ bool literalsCouldRace(const rose_literal_id &lit1, return r.first == smaller->rend(); } +bool RoseDedupeAuxImpl::hasSafeMultiReports( + const flat_set &reports) const { + if (reports.size() <= 1) { + return true; + } + + /* We have more than one ReportID corresponding to the external ID that is + * presented to the user. These may differ in offset adjustment, bounds + * checks, etc. */ + + /* TODO: work out if these differences will actually cause problems */ + + /* One common case where we know we don't have a problem is if there are + * precisely two reports, one for the main Rose path and one for the + * "small block matcher" path. 
*/ + if (reports.size() == 2) { + ReportID id1 = *reports.begin(); + ReportID id2 = *reports.rbegin(); + + bool has_verts_1 = contains(vert_map, id1); + bool has_verts_2 = contains(vert_map, id2); + bool has_sb_verts_1 = contains(sb_vert_map, id1); + bool has_sb_verts_2 = contains(sb_vert_map, id2); + + if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) { + DEBUG_PRINTF("two reports, one full and one small block: ok\n"); + return true; + } + } + + DEBUG_PRINTF("more than one report\n"); + return false; +} + bool RoseDedupeAuxImpl::requiresDedupeSupport( const ue2::flat_set &reports) const { /* TODO: this could be expanded to check for offset or character constraints */ + DEBUG_PRINTF("reports: %s\n", as_string_list(reports).c_str()); + const RoseGraph &g = tbi.g; bool has_suffix = false; bool has_outfix = false; - if (reports.size() > 1) { - /* may have offset adjust */ - /* TODO: work out if the offset adjust will actually cause problems */ + if (!hasSafeMultiReports(reports)) { + DEBUG_PRINTF("multiple reports not safe\n"); return true; } @@ -697,7 +745,6 @@ bool RoseDedupeAuxImpl::requiresDedupeSupport( if (contains(vert_map, r)) { insert(&roles, vert_map.at(r)); } - if (contains(suffix_map, r)) { insert(&suffixes, suffix_map.at(r)); } @@ -880,7 +927,7 @@ namespace { class OutfixAllReports : public boost::static_visitor> { public: set operator()(const boost::blank &) const { - return {}; + return set(); } template @@ -909,7 +956,7 @@ set all_reports(const OutfixInfo &outfix) { bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const { return top == b.top && graph == b.graph && castle == b.castle && - rdfa == b.rdfa && haig == b.haig; + rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama; } bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { @@ -919,6 +966,7 @@ bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { ORDER_CHECK(castle); ORDER_CHECK(haig); ORDER_CHECK(rdfa); + ORDER_CHECK(tamarama); assert(a.dfa_min_width == b.dfa_min_width); assert(a.dfa_max_width == b.dfa_max_width); return false; @@ -931,13 +979,16 @@ void RoseSuffixInfo::reset(void) { castle.reset(); rdfa.reset(); haig.reset(); + tamarama.reset(); dfa_min_width = 0; dfa_max_width = depth::infinity(); } std::set all_reports(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); - if (s.graph()) { + if (s.tamarama()) { + return all_reports(*s.tamarama()); + } else if (s.graph()) { return all_reports(*s.graph()); } else if (s.castle()) { return all_reports(*s.castle()); @@ -1149,6 +1200,7 @@ void LeftEngInfo::reset(void) { castle.reset(); dfa.reset(); haig.reset(); + tamarama.reset(); lag = 0; leftfix_report = MO_INVALID_IDX; dfa_min_width = 0; @@ -1187,6 +1239,11 @@ u32 roseQuality(const RoseEngine *t) { always_run++; } + if (t->eagerIterOffset) { + /* eager prefixes are always run */ + always_run++; + } + const HWLM *ftable = getFLiteralMatcher(t); if (ftable) { /* TODO: ignore conditional ftables, or ftables beyond smwr region */ @@ -1227,30 +1284,6 @@ u32 roseQuality(const RoseEngine *t) { return 1; } -/** \brief Add a SMWR engine to the given RoseEngine. 
*/ -aligned_unique_ptr roseAddSmallWrite(const RoseEngine *t, - const SmallWriteEngine *smwr) { - assert(t); - assert(smwr); - - const u32 mainSize = roseSize(t); - const u32 smallWriteSize = smwrSize(smwr); - - u32 smwrOffset = ROUNDUP_CL(mainSize); - u32 newSize = smwrOffset + smallWriteSize; - - aligned_unique_ptr t2 = - aligned_zmalloc_unique(newSize); - char *ptr = (char *)t2.get(); - memcpy(ptr, t, mainSize); - memcpy(ptr + smwrOffset, smwr, smallWriteSize); - - t2->smallWriteOffset = smwrOffset; - t2->size = newSize; - - return t2; -} - #ifndef NDEBUG /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose * graph are implementable. */ diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 1f873403..c2366f0e 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -156,22 +156,31 @@ private: ue2::unordered_set hash_cont; /* member checks */ }; -/** - * \brief Mapping from a particular rose engine to a set of associated - * vertices. - */ -typedef ue2::unordered_map > revRoseMap; +struct RoseAliasingInfo { + RoseAliasingInfo(const RoseBuildImpl &build) { + const auto &g = build.g; -} // namespace + // Populate reverse leftfix map. + for (auto v : vertices_range(g)) { + if (g[v].left) { + rev_leftfix[g[v].left].insert(v); + } + } -static -void populateRevRoseMap(const RoseGraph &g, revRoseMap *out) { - for (auto v : vertices_range(g)) { - if (g[v].left) { - (*out)[g[v].left].insert(v); + // Populate reverse ghost vertex map. + for (const auto &m : build.ghost) { + rev_ghost[m.second].insert(m.first); } } -} + + /** \brief Mapping from leftfix to vertices. */ + ue2::unordered_map> rev_leftfix; + + /** \brief Mapping from undelayed ghost to delayed vertices. */ + ue2::unordered_map> rev_ghost; +}; + +} // namespace // Check successor set: must lead to the same vertices via edges with the // same properties. @@ -262,7 +271,8 @@ bool samePredecessors(RoseVertex a, RoseVertex b, const RoseGraph &g) { } static -bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { +bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, + const RoseGraph &g) { for (const auto &e_a : out_edges_range(a, g)) { bool exists; RoseEdge e; @@ -283,7 +293,8 @@ bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) } static -bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { +bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, + const RoseGraph &g) { for (const auto &e_a : in_edges_range(a, g)) { bool exists; RoseEdge e; @@ -309,23 +320,24 @@ bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) } static -bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &tbi) { - const auto &lits_a = tbi.g[a].literals; - const auto &lits_b = tbi.g[b].literals; +bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &build) { + const auto &lits_a = build.g[a].literals; + const auto &lits_b = build.g[b].literals; assert(!lits_a.empty() && !lits_b.empty()); // If both vertices have only pseudo-dotstar in-edges, we can merge // literals of different lengths and can avoid the check below. - if (tbi.hasOnlyPseudoStarInEdges(a) && tbi.hasOnlyPseudoStarInEdges(b)) { + if (build.hasOnlyPseudoStarInEdges(a) && + build.hasOnlyPseudoStarInEdges(b)) { DEBUG_PRINTF("both have pseudo-dotstar in-edges\n"); return true; } // Otherwise, all the literals involved must have the same length. 
for (u32 a_id : lits_a) { - const rose_literal_id &la = tbi.literals.right.at(a_id); + const rose_literal_id &la = build.literals.right.at(a_id); for (u32 b_id : lits_b) { - const rose_literal_id &lb = tbi.literals.right.at(b_id); + const rose_literal_id &lb = build.literals.right.at(b_id); if (la.elength() != lb.elength()) { DEBUG_PRINTF("bad merge %zu!=%zu '%s', '%s'\n", la.elength(), @@ -339,8 +351,8 @@ bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &tbi) { } static -bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &tbi) { - const RoseVertexProps &props = tbi.g[v]; +bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &build) { + const RoseVertexProps &props = build.g[v]; // Must have literals. if (props.literals.empty()) { @@ -348,14 +360,43 @@ bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &tbi) { } assert(*props.literals.begin() != MO_INVALID_IDX); + return true; +} - // Any vertex involved in a "ghost" relationship has already been disallowed +static +bool sameGhostProperties(const RoseBuildImpl &build, + const RoseAliasingInfo &rai, RoseVertex a, + RoseVertex b) { + // If these are ghost mapping keys, then they must map to the same vertex. + if (contains(build.ghost, a) || contains(build.ghost, b)) { + DEBUG_PRINTF("checking ghost key compat\n"); + if (!contains(build.ghost, a) || !contains(build.ghost, b)) { + DEBUG_PRINTF("missing ghost mapping\n"); + return false; + } + if (build.ghost.at(a) != build.ghost.at(b)) { + DEBUG_PRINTF("diff ghost mapping\n"); + return false; + } + DEBUG_PRINTF("ghost mappings ok\n"); + return true; + } + + // If they are ghost vertices, then they must have the same literals. + if (contains(rai.rev_ghost, a) || contains(rai.rev_ghost, b)) { + if (!contains(rai.rev_ghost, a) || !contains(rai.rev_ghost, b)) { + DEBUG_PRINTF("missing ghost reverse mapping\n"); + return false; + } + return build.g[a].literals == build.g[b].literals; + } return true; } static -bool sameRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) { +bool sameRoleProperties(const RoseBuildImpl &build, const RoseAliasingInfo &rai, + RoseVertex a, RoseVertex b) { const RoseGraph &g = build.g; const RoseVertexProps &aprops = g[a], &bprops = g[b]; @@ -380,13 +421,17 @@ bool sameRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) return false; } + if (!sameGhostProperties(build, rai, a, b)) { + return false; + } + /* "roses are mergeable" check are handled elsewhere */ return true; } -/* Checks compatibility of role properties if we require that two roles are right - * equiv. */ +/* Checks compatibility of role properties if we require that two roles are + * right equiv. */ static bool sameRightRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) { @@ -448,15 +493,6 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) { return val; } -static -void removeVertexFromMaps(RoseVertex v, RoseBuildImpl &build, revRoseMap &rrm) { - if (build.g[v].left) { - const left_id left(build.g[v].left); - assert(contains(rrm[left], v)); - rrm[left].erase(v); - } -} - static void mergeEdgeAdd(RoseVertex u, RoseVertex v, const RoseEdge &from_edge, const RoseEdge *to_edge, RoseGraph &g) { @@ -485,7 +521,7 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { // Cache b's in-edges so we can look them up by source quickly. 
for (const auto &e : in_edges_range(b, g)) { RoseVertex u = source(e, g); - b_edges.insert(make_pair(u, e)); + b_edges.emplace(u, e); } // Add a's in-edges to b, merging them in where b already has the new edge. @@ -504,7 +540,7 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { b_edges.clear(); for (const auto &e : out_edges_range(b, g)) { RoseVertex v = target(e, g); - b_edges.insert(make_pair(v, e)); + b_edges.emplace(v, e); } // Add a's out-edges to b, merging them in where b already has the new edge. @@ -524,11 +560,11 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { } static -void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { - RoseGraph &g = tbi.g; +void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &build) { + RoseGraph &g = build.g; const auto &a_literals = g[a].literals; for (u32 lit_id : a_literals) { - auto &lit_vertices = tbi.literal_info[lit_id].vertices; + auto &lit_vertices = build.literal_info[lit_id].vertices; lit_vertices.erase(a); lit_vertices.insert(b); } @@ -536,37 +572,91 @@ void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { insert(&g[b].literals, a_literals); } -// Merge role 'a' into 'b'. static -void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, - revRoseMap &rrm) { - RoseGraph &g = tbi.g; - DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); +void updateAliasingInfo(RoseBuildImpl &build, RoseAliasingInfo &rai, + RoseVertex a, RoseVertex b) { + if (build.g[a].left) { + const left_id left(build.g[a].left); + assert(contains(rai.rev_leftfix[left], a)); + rai.rev_leftfix[left].erase(a); + } + if (contains(build.ghost, a)) { + auto ghost = build.ghost.at(a); + assert(contains(build.ghost, b) && ghost == build.ghost.at(b)); + build.ghost.erase(a); + rai.rev_ghost[ghost].erase(a); + } + + if (contains(rai.rev_ghost, a)) { + for (const auto &v : rai.rev_ghost[a]) { + build.ghost[v] = b; + rai.rev_ghost[b].insert(v); + } + rai.rev_ghost.erase(a); + } +} + +/** \brief Common role merge code used by variants below. */ +static +void mergeCommon(RoseBuildImpl &build, RoseAliasingInfo &rai, RoseVertex a, + RoseVertex b) { + RoseGraph &g = build.g; - // Merge role properties. assert(g[a].eod_accept == g[b].eod_accept); assert(g[a].left == g[b].left); - - insert(&g[b].reports, g[a].reports); + assert(!g[a].suffix || g[a].suffix == g[b].suffix); // In some situations (ghost roles etc), we can have different groups. assert(!g[a].groups && !g[b].groups); /* current structure means groups * haven't been assigned yet */ g[b].groups |= g[a].groups; - g[b].min_offset = min(g[a].min_offset, g[b].min_offset); - g[b].max_offset = max(g[a].max_offset, g[b].max_offset); + mergeLiteralSets(a, b, build); + updateAliasingInfo(build, rai, a, b); - mergeLiteralSets(a, b, tbi); + // Our min and max_offsets should be sane. + assert(g[b].min_offset <= g[b].max_offset); + + // Safety check: we should not have created through a merge a vertex that + // has an out-edge with ANCH history but is not fixed-offset. + assert(!hasAnchHistorySucc(g, b) || g[b].fixedOffset()); +} + +/** \brief Merge role 'a' into 'b', left merge path. */ +static +void mergeVerticesLeft(RoseVertex a, RoseVertex b, RoseBuildImpl &build, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + + insert(&g[b].reports, g[a].reports); + + // Since it is a left merge (identical LHS) we should pick the tighter + // bound. 
+ g[b].min_offset = max(g[a].min_offset, g[b].min_offset); + g[b].max_offset = min(g[a].max_offset, g[b].max_offset); if (!g[b].suffix) { g[b].suffix = g[a].suffix; - } else { - assert(!g[a].suffix || g[b].suffix == g[a].suffix); } mergeEdges(a, b, g); - removeVertexFromMaps(a, tbi, rrm); + mergeCommon(build, rai, a, b); +} + +/** \brief Merge role 'a' into 'b', right merge path. */ +static +void mergeVerticesRight(RoseVertex a, RoseVertex b, RoseBuildImpl &build, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + + insert(&g[b].reports, g[a].reports); + g[b].min_offset = min(g[a].min_offset, g[b].min_offset); + g[b].max_offset = max(g[a].max_offset, g[b].max_offset); + + mergeEdges(a, b, g); + mergeCommon(build, rai, a, b); } /** @@ -574,57 +664,35 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, * that the in- and out-edge sets, reports and suffixes are identical. */ static -void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, - revRoseMap &rrm) { - RoseGraph &g = tbi.g; +void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &build, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); - // Merge role properties. For a diamond merge, most properties are already - // the same (with the notable exception of the literal set). - assert(g[a].eod_accept == g[b].eod_accept); - assert(g[a].left == g[b].left); + // For a diamond merge, most properties are already the same (with the + // notable exception of the literal set). assert(g[a].reports == g[b].reports); assert(g[a].suffix == g[b].suffix); - // In some situations (ghost roles etc), we can have different groups. - assert(!g[a].groups && !g[b].groups); /* current structure means groups - * haven't been assigned yet */ - g[b].groups |= g[a].groups; - g[b].min_offset = min(g[a].min_offset, g[b].min_offset); g[b].max_offset = max(g[a].max_offset, g[b].max_offset); - mergeLiteralSets(a, b, tbi); - removeVertexFromMaps(a, tbi, rrm); + mergeCommon(build, rai, a, b); } static never_inline -void findCandidates(const RoseBuildImpl &tbi, CandidateSet *candidates) { - ue2::unordered_set disallowed; - - // We currently deny candidature to any vertex involved in a "ghost" - // relationship. - for (const auto &m : tbi.ghost) { - disallowed.insert(m.first); - disallowed.insert(m.second); - } - - for (auto v : vertices_range(tbi.g)) { - // Ignore ghost relationships. 
- if (contains(disallowed, v)) { - continue; - } - - if (isAliasingCandidate(v, tbi)) { - DEBUG_PRINTF("candidate %zu\n", tbi.g[v].idx); - DEBUG_PRINTF("lits: %u\n", *tbi.g[v].literals.begin()); +void findCandidates(const RoseBuildImpl &build, CandidateSet *candidates) { + for (auto v : vertices_range(build.g)) { + if (isAliasingCandidate(v, build)) { + DEBUG_PRINTF("candidate %zu\n", build.g[v].idx); + DEBUG_PRINTF("lits: %u\n", *build.g[v].literals.begin()); candidates->insert(v); } } - assert(candidates->size() <= num_vertices(tbi.g)); + assert(candidates->size() <= num_vertices(build.g)); DEBUG_PRINTF("found %zu/%zu candidates\n", candidates->size(), - num_vertices(tbi.g)); + num_vertices(build.g)); } static @@ -639,7 +707,7 @@ RoseVertex pickSucc(const RoseVertex v, const RoseGraph &g) { static RoseVertex pickPred(const RoseVertex v, const RoseGraph &g, - const RoseBuildImpl &tbi) { + const RoseBuildImpl &build) { RoseGraph::in_edge_iterator ei, ee; tie(ei, ee) = in_edges(v, g); if (ei == ee) { @@ -650,7 +718,7 @@ RoseVertex pickPred(const RoseVertex v, const RoseGraph &g, // Avoid roots if we have other options, since it doesn't matter to the // merge pass which predecessor we pick. RoseVertex u = source(*ei, g); - while (tbi.isAnyStart(u) && ++ei != ee) { + while (build.isAnyStart(u) && ++ei != ee) { u = source(*ei, g); } return u; @@ -700,12 +768,13 @@ bool hasCommonPredWithDiffRoses(RoseVertex a, RoseVertex b, } static -void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr h, +void pruneReportIfUnused(const RoseBuildImpl &build, shared_ptr h, const set &verts, ReportID report) { DEBUG_PRINTF("trying to prune %u from %p (v %zu)\n", report, h.get(), verts.size()); for (RoseVertex v : verts) { - if (tbi.g[v].left.graph == h && tbi.g[v].left.leftfix_report == report) { + if (build.g[v].left.graph == h && + build.g[v].left.leftfix_report == report) { DEBUG_PRINTF("report %u still in use\n", report); return; } @@ -717,12 +786,12 @@ void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr h, // unimplementable. DEBUG_PRINTF("report %u has been merged away, pruning\n", report); - assert(h->kind == tbi.isRootSuccessor(*verts.begin()) ? NFA_PREFIX - : NFA_INFIX); + assert(h->kind == build.isRootSuccessor(*verts.begin()) ? 
NFA_PREFIX + : NFA_INFIX); unique_ptr h_new = cloneHolder(*h); pruneReport(*h_new, report); - if (isImplementableNFA(*h_new, nullptr, tbi.cc)) { + if (isImplementableNFA(*h_new, nullptr, build.cc)) { clear_graph(*h); cloneHolder(*h, *h_new); } else { @@ -828,9 +897,9 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g, } static -bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, - revRoseMap &rrm) { - RoseGraph &g = tbi.g; +bool mergeSameCastle(RoseBuildImpl &build, RoseVertex a, RoseVertex b, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; CastleProto &castle = *a_left.castle; @@ -853,7 +922,7 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, return false; } - const ReportID new_report = tbi.getNewNfaReport(); + const ReportID new_report = build.getNewNfaReport(); map a_top_map, b_top_map; for (const auto &c : castle.repeats) { @@ -875,9 +944,9 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, } } - assert(contains(rrm[b_left], b)); - rrm[b_left].erase(b); - rrm[a_left].insert(b); + assert(contains(rai.rev_leftfix[b_left], b)); + rai.rev_leftfix[b_left].erase(b); + rai.rev_leftfix[a_left].insert(b); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; @@ -886,15 +955,15 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, updateEdgeTops(g, a, a_top_map); updateEdgeTops(g, b, b_top_map); - pruneUnusedTops(castle, g, rrm[a_left]); + pruneUnusedTops(castle, g, rai.rev_leftfix[a_left]); return true; } static -bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, +bool attemptRoseCastleMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, - revRoseMap &rrm) { - RoseGraph &g = tbi.g; + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; left_id a_left_id(a_left); @@ -912,28 +981,28 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (&a_castle == &b_castle) { DEBUG_PRINTF("castles are the same\n"); - return mergeSameCastle(tbi, a, b, rrm); + return mergeSameCastle(build, a, b, rai); } if (is_equal(a_castle, a_left.leftfix_report, b_castle, b_left.leftfix_report)) { DEBUG_PRINTF("castles are equiv with respect to reports\n"); - if (rrm[a_left_id].size() == 1) { + if (rai.rev_leftfix[a_left_id].size() == 1) { /* nobody else is using a_castle */ - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); - pruneUnusedTops(b_castle, g, rrm[b_left_id]); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); + pruneUnusedTops(b_castle, g, rai.rev_leftfix[b_left_id]); b_left.castle = a_left.castle; b_left.leftfix_report = a_left.leftfix_report; DEBUG_PRINTF("OK -> only user of a_castle\n"); return true; } - if (rrm[b_left_id].size() == 1) { + if (rai.rev_leftfix[b_left_id].size() == 1) { /* nobody else is using b_castle */ - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); - pruneUnusedTops(a_castle, g, rrm[a_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); + pruneUnusedTops(a_castle, g, rai.rev_leftfix[a_left_id]); a_left.castle = b_left.castle; a_left.leftfix_report = b_left.leftfix_report; DEBUG_PRINTF("OK -> only user of b_castle\n"); @@ -942,32 +1011,32 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (preds_same) { /* preds are the same anyway in diamond/left 
merges just need to - * check that all the literals in rrm[b_h] can handle a_h */ - for (auto v : rrm[b_left_id]) { - if (!mergeableRoseVertices(tbi, a, v)) { + * check that all the literals in rev_leftfix[b_h] can handle a_h */ + for (auto v : rai.rev_leftfix[b_left_id]) { + if (!mergeableRoseVertices(build, a, v)) { goto literal_mismatch_1; } } - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); - pruneUnusedTops(a_castle, g, rrm[a_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); + pruneUnusedTops(a_castle, g, rai.rev_leftfix[a_left_id]); a_left.castle = b_left.castle; a_left.leftfix_report = b_left.leftfix_report; DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_1: /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[a_h] can handle b_h */ - for (auto v : rrm[a_left_id]) { - if (!mergeableRoseVertices(tbi, v, b)) { + * check that all the literals in rev_leftfix[a_h] can handle b_h */ + for (auto v : rai.rev_leftfix[a_left_id]) { + if (!mergeableRoseVertices(build, v, b)) { goto literal_mismatch_2; } } - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); - pruneUnusedTops(b_castle, g, rrm[b_left_id]); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); + pruneUnusedTops(b_castle, g, rai.rev_leftfix[b_left_id]); b_left.castle = a_left.castle; b_left.leftfix_report = a_left.leftfix_report; DEBUG_PRINTF("OK -> same preds ???\n"); @@ -978,15 +1047,15 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* we need to create a new graph as there may be other people * using b_left and it would be bad if a's preds started triggering it */ - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr new_castle = make_shared(a_castle); pruneCastle(*new_castle, a_left.leftfix_report); setReports(*new_castle, new_report); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - pruneUnusedTops(*a_left.castle, g, rrm[a_left_id]); - pruneUnusedTops(*b_left.castle, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_left.castle, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_left.castle, g, rai.rev_leftfix[b_left_id]); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; @@ -994,9 +1063,9 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, b_left.castle = new_castle; assert(a_left == b_left); - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*new_castle, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + pruneUnusedTops(*new_castle, g, rai.rev_leftfix[a_left]); return true; } @@ -1008,27 +1077,27 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // Only infixes. Prefixes require special care when doing non-trivial // merges. 
- if (!tbi.isNonRootSuccessor(a) || !tbi.isNonRootSuccessor(b)) { + if (!build.isNonRootSuccessor(a) || !build.isNonRootSuccessor(b)) { return false; } - set &b_verts = rrm[b_left_id]; + set &b_verts = rai.rev_leftfix[b_left_id]; set aa; aa.insert(a); - if (!mergeableRoseVertices(tbi, aa, b_verts)) { + if (!mergeableRoseVertices(build, aa, b_verts)) { DEBUG_PRINTF("vertices not mergeable\n"); return false; } - if (!tbi.cc.grey.roseMultiTopRoses || !tbi.cc.grey.allowCastle) { + if (!build.cc.grey.roseMultiTopRoses || !build.cc.grey.allowCastle) { return false; } DEBUG_PRINTF("merging into new castle\n"); // Clone new castle with a's repeats in it, set to a new report. - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr m_castle = make_shared(a_castle); pruneCastle(*m_castle, a_left.leftfix_report); setReports(*m_castle, new_report); @@ -1070,10 +1139,10 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, DEBUG_PRINTF("merged into castle containing %zu repeats\n", m_castle->repeats.size()); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - pruneUnusedTops(*a_left.castle, g, rrm[a_left_id]); - pruneUnusedTops(*b_left.castle, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_left.castle, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_left.castle, g, rai.rev_leftfix[b_left_id]); a_left.castle = m_castle; a_left.leftfix_report = new_report; @@ -1081,17 +1150,17 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, b_left.leftfix_report = new_report; assert(a_left == b_left); - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*m_castle, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + pruneUnusedTops(*m_castle, g, rai.rev_leftfix[a_left]); return true; } static -bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, +bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, - revRoseMap &rrm) { - RoseGraph &g = tbi.g; + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; left_id a_left_id(a_left); @@ -1108,72 +1177,74 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, DEBUG_PRINTF("OK -> same actual holder\n"); ReportID a_oldreport = a_left.leftfix_report; ReportID b_oldreport = b_left.leftfix_report; - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); duplicateReport(*a_h, a_left.leftfix_report, new_report); duplicateReport(*b_h, b_left.leftfix_report, new_report); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], a_oldreport); - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], b_oldreport); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], + a_oldreport); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], + b_oldreport); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); assert(a_left == b_left); return true; } /* if it is the same graph, it is also fairly easy */ if (is_equal(*a_h, a_left.leftfix_report, *b_h, b_left.leftfix_report)) { - if (rrm[a_left_id].size() == 1) { + if (rai.rev_leftfix[a_left_id].size() == 1) { /* nobody else is using a_h */ - rrm[b_left_id].erase(b); - 
rrm[a_left_id].insert(b); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); b_left.graph = a_h; b_left.leftfix_report = a_left.leftfix_report; - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); DEBUG_PRINTF("OK -> only user of a_h\n"); return true; } - if (rrm[b_left_id].size() == 1) { + if (rai.rev_leftfix[b_left_id].size() == 1) { /* nobody else is using b_h */ - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); a_left.graph = b_h; a_left.leftfix_report = b_left.leftfix_report; - pruneUnusedTops(*a_h, g, rrm[a_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); DEBUG_PRINTF("OK -> only user of b_h\n"); return true; } if (preds_same) { /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[b_h] can handle a_h */ - for (auto v : rrm[b_left_id]) { - if (!mergeableRoseVertices(tbi, a, v)) { + * check that all the literals in rev_leftfix[b_h] can handle a_h */ + for (auto v : rai.rev_leftfix[b_left_id]) { + if (!mergeableRoseVertices(build, a, v)) { goto literal_mismatch_1; } } - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); a_left.graph = b_h; a_left.leftfix_report = b_left.leftfix_report; - pruneUnusedTops(*a_h, g, rrm[a_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_1: /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[a_h] can handle b_h */ - for (auto v : rrm[a_left_id]) { - if (!mergeableRoseVertices(tbi, v, b)) { + * check that all the literals in rev_leftfix[a_h] can handle b_h */ + for (auto v : rai.rev_leftfix[a_left_id]) { + if (!mergeableRoseVertices(build, v, b)) { goto literal_mismatch_2; } } - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); b_left.graph = a_h; b_left.leftfix_report = a_left.leftfix_report; - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_2:; @@ -1182,25 +1253,24 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* we need to create a new graph as there may be other people * using b_left and it would be bad if a's preds started triggering it */ - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr new_graph = cloneHolder(*b_h); duplicateReport(*new_graph, b_left.leftfix_report, new_report); - pruneReportIfUnused(tbi, new_graph, set(), - b_left.leftfix_report); + pruneAllOtherReports(*new_graph, new_report); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - pruneUnusedTops(*a_h, g, rrm[a_left_id]); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; a_left.graph = new_graph; b_left.graph = new_graph; - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*new_graph, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + 
pruneUnusedTops(*new_graph, g, rai.rev_leftfix[a_left]); return true; } @@ -1212,23 +1282,23 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // Only infixes. Prefixes require special care when doing non-trivial // merges. - if (!tbi.isNonRootSuccessor(a) || !tbi.isNonRootSuccessor(b)) { + if (!build.isNonRootSuccessor(a) || !build.isNonRootSuccessor(b)) { return false; } DEBUG_PRINTF("attempting merge of roses on vertices %zu and %zu\n", g[a].idx, g[b].idx); - set &b_verts = rrm[b_left]; + set &b_verts = rai.rev_leftfix[b_left]; set aa; aa.insert(a); - if (!mergeableRoseVertices(tbi, aa, b_verts)) { + if (!mergeableRoseVertices(build, aa, b_verts)) { DEBUG_PRINTF("vertices not mergeable\n"); return false; } - if (!tbi.cc.grey.roseMultiTopRoses) { + if (!build.cc.grey.roseMultiTopRoses) { return false; } @@ -1238,10 +1308,10 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* We need to allocate a new report id because */ ReportID a_oldreport = a_left.leftfix_report; ReportID b_oldreport = b_left.leftfix_report; - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); duplicateReport(*b_h, b_left.leftfix_report, new_report); b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], b_oldreport); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], b_oldreport); NGHolder victim; cloneHolder(victim, *a_h); @@ -1265,7 +1335,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, assert(victim.kind == b_h->kind); assert(!generates_callbacks(*b_h)); - if (!mergeNfaPair(victim, *b_h, nullptr, tbi.cc)) { + if (!mergeNfaPair(victim, *b_h, nullptr, build.cc)) { DEBUG_PRINTF("merge failed\n"); // Restore in-edge properties. for (const auto &e : in_edges_range(a, g)) { @@ -1282,22 +1352,22 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, a_left.graph = b_h; a_left.leftfix_report = new_report; - assert(contains(rrm[a_left_id], a)); - assert(contains(rrm[b_left_id], b)); - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + assert(contains(rai.rev_leftfix[a_left_id], a)); + assert(contains(rai.rev_leftfix[b_left_id], b)); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); - pruneUnusedTops(*a_h, g, rrm[a_left_id]); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); // Prune A's report from its old prefix if it was only used by A. - pruneReportIfUnused(tbi, a_h, rrm[a_left_id], a_oldreport); + pruneReportIfUnused(build, a_h, rai.rev_leftfix[a_left_id], a_oldreport); - reduceImplementableGraph(*b_h, SOM_NONE, nullptr, tbi.cc); + reduceImplementableGraph(*b_h, SOM_NONE, nullptr, build.cc); assert(roseHasTops(g, a)); assert(roseHasTops(g, b)); - assert(isImplementableNFA(*b_h, nullptr, tbi.cc)); + assert(isImplementableNFA(*b_h, nullptr, build.cc)); return true; } @@ -1305,13 +1375,14 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // the two LeftEngInfo structures to be the same. Returns false if the merge // is not possible. 
static -bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, - RoseVertex b, bool trivialCasesOnly, revRoseMap &rrm) { +bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, + RoseVertex b, bool trivialCasesOnly, + RoseAliasingInfo &rai) { DEBUG_PRINTF("attempting rose merge, vertices a=%zu, b=%zu\n", - tbi.g[a].idx, tbi.g[b].idx); + build.g[a].idx, build.g[b].idx); assert(a != b); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; @@ -1335,8 +1406,8 @@ bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, } // Only non-transients for the moment. - if (contains(tbi.transient, a_left_id) || - contains(tbi.transient, b_left_id)) { + if (contains(build.transient, a_left_id) || + contains(build.transient, b_left_id)) { return false; } @@ -1350,13 +1421,13 @@ bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, assert(roseHasTops(g, b)); if (a_left_id.graph() && b_left_id.graph()) { - return attemptRoseGraphMerge(tbi, preds_same, a, b, trivialCasesOnly, - rrm); + return attemptRoseGraphMerge(build, preds_same, a, b, trivialCasesOnly, + rai); } if (a_left_id.castle() && b_left_id.castle()) { - return attemptRoseCastleMerge(tbi, preds_same, a, b, trivialCasesOnly, - rrm); + return attemptRoseCastleMerge(build, preds_same, a, b, trivialCasesOnly, + rai); } return false; @@ -1481,8 +1552,8 @@ void splitByNeighbour(const RoseGraph &g, vector> &buckets, } static -vector> splitDiamondMergeBuckets(CandidateSet &candidates, - const RoseBuildImpl &build) { +vector> +splitDiamondMergeBuckets(CandidateSet &candidates, const RoseBuildImpl &build) { const RoseGraph &g = build.g; vector> buckets(1); @@ -1499,19 +1570,20 @@ vector> splitDiamondMergeBuckets(CandidateSet &candidates, return buckets; } + static never_inline -void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, +void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &build, vector *dead, bool mergeRoses, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; if (candidates.empty()) { return; } /* Vertices may only be diamond merged with others in the same bucket */ - auto cand_buckets = splitDiamondMergeBuckets(candidates, tbi); + auto cand_buckets = splitDiamondMergeBuckets(candidates, build); for (const vector &siblings : cand_buckets) { for (auto it = siblings.begin(); it != siblings.end();) { @@ -1525,7 +1597,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, RoseVertex b = *jt; assert(contains(candidates, b)); - if (!sameRoleProperties(tbi, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { DEBUG_PRINTF("diff role prop\n"); continue; } @@ -1536,23 +1608,23 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, * so we still have to checks successors and predecessors. 
*/ if (!sameSuccessors(a, b, g) - || !sameRightRoleProperties(tbi, a, b) + || !sameRightRoleProperties(build, a, b) || !samePredecessors(a, b, g)) { DEBUG_PRINTF("not diamond\n"); continue; } - if (!canMergeLiterals(a, b, tbi)) { + if (!canMergeLiterals(a, b, build)) { DEBUG_PRINTF("incompatible lits\n"); continue; } - if (!attemptRoseMerge(tbi, true, a, b, !mergeRoses, rrm)) { + if (!attemptRoseMerge(build, true, a, b, !mergeRoses, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVerticesDiamond(a, b, tbi, rrm); + mergeVerticesDiamond(a, b, build, rai); dead->push_back(a); candidates.erase(a); break; // next a @@ -1568,6 +1640,7 @@ vector::iterator findLeftMergeSibling( vector::iterator it, const vector::iterator &end, const RoseVertex a, const RoseBuildImpl &build, + const RoseAliasingInfo &rai, const CandidateSet &candidates) { const RoseGraph &g = build.g; @@ -1581,7 +1654,7 @@ vector::iterator findLeftMergeSibling( continue; } - if (!sameRoleProperties(build, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { continue; } @@ -1611,10 +1684,10 @@ vector::iterator findLeftMergeSibling( } static never_inline -void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, - vector *dead, revRoseMap &rrm) { +void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, + vector *dead, RoseAliasingInfo &rai) { DEBUG_PRINTF("begin (%zu)\n", candidates.size()); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; vector siblings; CandidateSet::iterator it = candidates.begin(); @@ -1629,11 +1702,11 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, assert(!g[a].literals.empty()); u32 lit_id = *g[a].literals.begin(); - const auto &verts = tbi.literal_info.at(lit_id).vertices; - RoseVertex pred = pickPred(a, g, tbi); + const auto &verts = build.literal_info.at(lit_id).vertices; + RoseVertex pred = pickPred(a, g, build); siblings.clear(); - if (pred == RoseGraph::null_vertex() || tbi.isAnyStart(pred) || + if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred) || hasGreaterOutDegree(verts.size(), pred, g)) { // Select sibling from amongst the vertices that share a literal. 
siblings.insert(siblings.end(), verts.begin(), verts.end()); @@ -1645,20 +1718,20 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, sort(siblings.begin(), siblings.end(), VertexIndexComp(g)); - auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, tbi, - candidates); + auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, + build, rai, candidates); if (jt == siblings.end()) { continue; } RoseVertex b = *jt; - if (!attemptRoseMerge(tbi, true, a, b, 0, rrm)) { + if (!attemptRoseMerge(build, true, a, b, 0, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVertices(a, b, tbi, rrm); + mergeVerticesLeft(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } @@ -1693,6 +1766,7 @@ vector::const_iterator findRightMergeSibling( vector::const_iterator it, const vector::const_iterator &end, const RoseVertex a, const RoseBuildImpl &build, + const RoseAliasingInfo &rai, const CandidateSet &candidates) { const RoseGraph &g = build.g; @@ -1706,7 +1780,7 @@ vector::const_iterator findRightMergeSibling( continue; } - if (!sameRoleProperties(build, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { continue; } @@ -1764,10 +1838,10 @@ void split(map &keys, size_t *next_key, Iter it, } static never_inline -void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, - map > &sibling_cache, +void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &build, + map> &sibling_cache, map &keys_ext) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; size_t next_key = 1; map keys; @@ -1783,7 +1857,7 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, assert(!g[a].literals.empty()); u32 lit_id = *g[a].literals.begin(); RoseVertex succ = pickSucc(a, g); - const auto &verts = tbi.literal_info.at(lit_id).vertices; + const auto &verts = build.literal_info.at(lit_id).vertices; if (succ != RoseGraph::null_vertex() && !hasGreaterInDegree(verts.size(), succ, g)) { if (!done_succ.insert(succ).second) { @@ -1818,28 +1892,28 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, } for (auto &siblings : sibling_cache | map_values) { - sort(siblings.begin(), siblings.end(), VertexIndexComp(tbi.g)); + sort(siblings.begin(), siblings.end(), VertexIndexComp(build.g)); } } static const vector &getCandidateRightSiblings( - const map > &sibling_cache, + const map> &sibling_cache, map &keys, RoseVertex a) { size_t key = keys.at(a); return sibling_cache.at(key); } static never_inline -void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, +void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build, vector *dead, bool mergeRoses, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); - map > sibling_cache; + map> sibling_cache; map keys; - buildCandidateRightSiblings(candidates, tbi, sibling_cache, keys); + buildCandidateRightSiblings(candidates, build, sibling_cache, keys); CandidateSet::iterator it = candidates.begin(); while (it != candidates.end()) { @@ -1856,11 +1930,12 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, auto jt = siblings.begin(); while (jt != siblings.end()) { - jt = findRightMergeSibling(jt, siblings.end(), a, tbi, candidates); + jt = findRightMergeSibling(jt, siblings.end(), a, build, rai, + candidates); if (jt == siblings.end()) { break; } - if (attemptRoseMerge(tbi, false, a, *jt, !mergeRoses, rrm)) { + if (attemptRoseMerge(build, false, a, *jt, !mergeRoses, rai)) { break; } ++jt; @@ -1871,7 +1946,7 @@ 
void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, } RoseVertex b = *jt; - mergeVertices(a, b, tbi, rrm); + mergeVerticesRight(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } @@ -1947,10 +2022,9 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { return; } - revRoseMap rrm; - DEBUG_PRINTF("doing role aliasing mr=%d\n", (int)mergeRoses); - populateRevRoseMap(g, &rrm); + + RoseAliasingInfo rai(build); mergeRoses &= cc.grey.mergeRose & cc.grey.roseMergeRosesDuringAliasing; @@ -1963,8 +2037,8 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { size_t old_dead_size = 0; do { old_dead_size = dead.size(); - leftMergePass(candidates, build, &dead, rrm); - rightMergePass(candidates, build, &dead, mergeRoses, rrm); + leftMergePass(candidates, build, &dead, rai); + rightMergePass(candidates, build, &dead, mergeRoses, rai); } while (old_dead_size != dead.size()); /* Diamond merge passes cannot create extra merges as they require the same @@ -1972,7 +2046,7 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { * to a merge to different pred/succ before a diamond merge, it will still * be afterwards. */ filterDiamondCandidates(g, candidates); - diamondMergePass(candidates, build, &dead, mergeRoses, rrm); + diamondMergePass(candidates, build, &dead, mergeRoses, rai); DEBUG_PRINTF("killed %zu vertices\n", dead.size()); build.removeVertices(dead); diff --git a/src/rose/rose_build_util.h b/src/rose/rose_build_util.h index 536b031a..85cfc010 100644 --- a/src/rose/rose_build_util.h +++ b/src/rose/rose_build_util.h @@ -36,6 +36,9 @@ namespace ue2 { +/** Max allowed width for transient graphs in block mode */ +#define ROSE_BLOCK_TRANSIENT_MAX_WIDTH 255U + // Comparator for vertices using their index property. struct VertexIndexComp { VertexIndexComp(const RoseGraph &gg) : g(gg) {} diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index f6badd1b..a3d00943 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -40,6 +40,7 @@ #include "nfa/nfa_build_util.h" #include "nfa/nfa_dump_api.h" #include "nfa/nfa_internal.h" +#include "nfa/nfa_kind.h" #include "util/dump_charclass.h" #include "util/multibit_internal.h" #include "util/multibit.h" @@ -253,7 +254,9 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(CHECK_LIT_EARLY) {} + PROGRAM_CASE(CHECK_LIT_EARLY) { + os << " min_offset " << ri->min_offset << endl; + } PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_GROUPS) { @@ -288,6 +291,31 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK) { + os << " and_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->and_mask << std::dec << endl; + os << " cmp_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->cmp_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BYTE) { + os << " and_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->and_mask} << std::dec + << endl; + os << " cmp_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->cmp_mask} << std::dec + << endl; + os << " negation " << u32{ri->negation} << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } 
+ PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; @@ -310,6 +338,11 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(RECORD_ANCHORED) { + os << " id " << ri->id << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CATCH_UP) {} PROGRAM_NEXT_INSTRUCTION @@ -474,6 +507,17 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { + os << " iter_offset " << ri->iter_offset << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) {} + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { return; } PROGRAM_NEXT_INSTRUCTION @@ -529,7 +573,7 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { ofstream os(filename); const char *base = (const char *)t; - os << "Unconditional EOD Program:" << endl; + os << "EOD Program:" << endl; if (t->eodProgramOffset) { dumpProgram(os, t, base + t->eodProgramOffset); @@ -538,14 +582,6 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { os << "" << endl; } - os << "Sparse Iter EOD Program:" << endl; - - if (t->eodIterProgramOffset) { - dumpProgram(os, t, base + t->eodIterProgramOffset); - } else { - os << "" << endl; - } - os.close(); } @@ -600,6 +636,9 @@ void dumpNfaNotes(ofstream &fout, const RoseEngine *t, const NFA *n) { } const LeftNfaInfo *left = getLeftInfoByQueue(t, qindex); + if (left->eager) { + fout << "eager "; + } if (left->transient) { fout << "transient " << (u32)left->transient << " "; } @@ -659,6 +698,76 @@ void dumpComponentInfo(const RoseEngine *t, const string &base) { } } + +static +void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { + FILE *f = fopen((base +"rose_components.csv").c_str(), "w"); + + fprintf(f, "Index, Offset,Engine Type,States,Stream State,Bytecode Size," + "Kind,Notes\n"); + + for (u32 i = 0; i < t->queueCount; i++) { + const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); + const NFA *n = getNfaByInfo(t, nfa_info); + nfa_kind kind; + stringstream notes; + + if (i < t->outfixBeginQueue) { + notes << "chained;"; + } + + if (nfa_info->eod) { + notes << "eod;"; + } + + if (i < t->outfixEndQueue) { + kind = NFA_OUTFIX; + } else if (i < t->leftfixBeginQueue) { + kind = NFA_SUFFIX; + } else { + const LeftNfaInfo *left = getLeftInfoByQueue(t, i); + if (left->eager) { + notes << "eager;"; + } + if (left->transient) { + notes << "transient " << (u32)left->transient << ";"; + } + if (left->infix) { + kind = NFA_INFIX; + u32 maxQueueLen = left->maxQueueLen; + if (maxQueueLen != (u32)(-1)) { + notes << "maxqlen=" << maxQueueLen << ";"; + } + } else { + kind = NFA_PREFIX; + } + notes << "maxlag=" << left->maxLag << ";"; + if (left->stopTable) { + notes << "miracles;"; + } + if (left->countingMiracleOffset) { + auto cm = (const RoseCountingMiracle *) + ((const char *)t + left->countingMiracleOffset); + notes << "counting_miracle:" << (int)cm->count + << (cm->shufti ? 
"s" : "v") << ";"; + } + if (nfaSupportsZombie(n)) { + notes << " zombie;"; + } + if (left->eod_check) { + notes << "left_eod;"; + } + } + + fprintf(f, "%u,%zd,\"%s\",%u,%u,%u,%s,%s\n", i, + (const char *)n - (const char *)t, describe(*n).c_str(), + n->nPositions, n->streamStateSize, n->length, + to_string(kind).c_str(), notes.str().c_str()); + } + fclose(f); +} + + static void dumpExhaust(const RoseEngine *t, const string &base) { stringstream sstxt; @@ -710,7 +819,7 @@ void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) { FILE *f; f = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(n, f); + nfaDumpDot(n, f, base); fclose(f); f = fopen(sstxt.str().c_str(), "w"); @@ -770,7 +879,7 @@ void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) { FILE *f; f = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(n, f); + nfaDumpDot(n, f, base); fclose(f); f = fopen(sstxt.str().c_str(), "w"); @@ -801,7 +910,7 @@ void dumpAnchored(const RoseEngine *t, const string &base) { FILE *f; f = fopen(ssdot.str().c_str(), "w"); - nfaDumpDot(n, f); + nfaDumpDot(n, f, base); fclose(f); f = fopen(sstxt.str().c_str(), "w"); @@ -906,8 +1015,7 @@ void roseDumpText(const RoseEngine *t, FILE *f) { t->lookaroundTableOffset - t->lookaroundReachOffset); fprintf(f, "state space required : %u bytes\n", t->stateOffsets.end); - fprintf(f, " - history buffer : %u bytes (+1 for len)\n", - t->historyRequired); + fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8); fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); fprintf(f, " - floating matcher : %u bytes\n", t->floatingStreamState); @@ -925,6 +1033,7 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, "\n"); fprintf(f, "initial groups : 0x%016llx\n", t->initialGroups); + fprintf(f, "floating groups : 0x%016llx\n", t->floating_group_mask); fprintf(f, "handled key count : %u\n", t->handledKeyCount); fprintf(f, "\n"); @@ -1012,15 +1121,13 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, activeArrayCount); DUMP_U32(t, activeLeftCount); DUMP_U32(t, queueCount); + DUMP_U32(t, eagerIterOffset); DUMP_U32(t, handledKeyCount); DUMP_U32(t, leftOffset); DUMP_U32(t, roseCount); DUMP_U32(t, lookaroundTableOffset); DUMP_U32(t, lookaroundReachOffset); DUMP_U32(t, eodProgramOffset); - DUMP_U32(t, eodIterProgramOffset); - DUMP_U32(t, eodIterOffset); - DUMP_U32(t, eodNfaIterOffset); DUMP_U32(t, lastByteHistoryIterOffset); DUMP_U32(t, minWidth); DUMP_U32(t, minWidthExcludingBoundaries); @@ -1033,6 +1140,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, floatingMinLiteralMatchOffset); DUMP_U32(t, nfaInfoOffset); DUMP_U64(t, initialGroups); + DUMP_U64(t, floating_group_mask); DUMP_U32(t, size); DUMP_U32(t, delay_count); DUMP_U32(t, delay_base_id); @@ -1068,7 +1176,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, ematcherRegionSize); DUMP_U32(t, somRevCount); DUMP_U32(t, somRevOffsetOffset); - DUMP_U32(t, group_weak_end); DUMP_U32(t, floatingStreamState); fprintf(f, "}\n"); fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine)); @@ -1077,6 +1184,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { void roseDumpComponents(const RoseEngine *t, bool dump_raw, const string &base) { dumpComponentInfo(t, base); + dumpComponentInfoCsv(t, base); dumpNfas(t, dump_raw, base); dumpAnchored(t, base); dumpRevComponentInfo(t, base); diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h index 
b0ac8d11..6abe629b 100644 --- a/src/rose/rose_graph.h +++ b/src/rose/rose_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,6 +55,7 @@ namespace ue2 { struct CastleProto; struct raw_dfa; struct raw_som_dfa; +struct TamaProto; /** \brief Table type for a literal. */ enum rose_literal_table { @@ -82,6 +83,7 @@ struct LeftEngInfo { std::shared_ptr castle; std::shared_ptr dfa; std::shared_ptr haig; + std::shared_ptr tamarama; u32 lag = 0U; ReportID leftfix_report = MO_INVALID_IDX; depth dfa_min_width = 0; @@ -92,6 +94,7 @@ struct LeftEngInfo { && other.castle == castle && other.dfa == dfa && other.haig == haig + && other.tamarama == tamarama && other.lag == lag && other.leftfix_report == leftfix_report; } @@ -104,6 +107,7 @@ struct LeftEngInfo { ORDER_CHECK(castle); ORDER_CHECK(dfa); ORDER_CHECK(haig); + ORDER_CHECK(tamarama); ORDER_CHECK(lag); ORDER_CHECK(leftfix_report); return false; @@ -121,6 +125,7 @@ struct RoseSuffixInfo { std::shared_ptr castle; std::shared_ptr haig; std::shared_ptr rdfa; + std::shared_ptr tamarama; depth dfa_min_width = 0; depth dfa_max_width = depth::infinity(); @@ -128,7 +133,7 @@ struct RoseSuffixInfo { bool operator!=(const RoseSuffixInfo &b) const { return !(*this == b); } bool operator<(const RoseSuffixInfo &b) const; void reset(void); - operator bool() const { return graph || castle || haig || rdfa; } + operator bool() const { return graph || castle || haig || rdfa || tamarama; } }; /** \brief Properties attached to each Rose graph vertex. */ diff --git a/src/rose/rose_in_dump.cpp b/src/rose/rose_in_dump.cpp index 899e50c4..fbd6858b 100644 --- a/src/rose/rose_in_dump.cpp +++ b/src/rose/rose_in_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,7 +51,7 @@ namespace ue2 { void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, const char *filename) { - if (!grey.dumpFlags) { + if (!(grey.dumpFlags & Grey::DUMP_INT_GRAPH)) { return; } @@ -107,7 +107,8 @@ void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, size_t id = graph_ids.size(); graph_ids[&*ig[e].graph] = id; } - fprintf(f, "graph %zu", graph_ids[&*ig[e].graph]); + fprintf(f, "graph %zu\n%s", graph_ids[&*ig[e].graph], + to_string(ig[e].graph->kind).c_str()); } if (ig[e].haig) { fprintf(f, "haig "); diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h index 2c00a418..14d4d9b2 100644 --- a/src/rose/rose_in_graph.h +++ b/src/rose/rose_in_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,6 +106,12 @@ public: ROSE_BOUND_INF); } + /* for when there is a suffix graph which handles the reports */ + static RoseInVertexProps makeAcceptEod() { + return RoseInVertexProps(RIV_ACCEPT_EOD, ue2_literal(), 0, + ROSE_BOUND_INF); + } + static RoseInVertexProps makeStart(bool anchored) { DEBUG_PRINTF("making %s\n", anchored ? 
"anchored start" : "start"); if (anchored) { diff --git a/src/rose/rose_in_util.h b/src/rose/rose_in_util.h index 7c74554a..1f3c4ef7 100644 --- a/src/rose/rose_in_util.h +++ b/src/rose/rose_in_util.h @@ -46,6 +46,11 @@ void calcVertexOffsets(RoseInGraph &ig); enum nfa_kind whatRoseIsThis(const RoseInGraph &in, const RoseInEdge &e); void pruneUseless(RoseInGraph &g); +inline +bool is_any_accept(RoseInVertex v, const RoseInGraph &g) { + return g[v].type == RIV_ACCEPT || g[v].type == RIV_ACCEPT_EOD; +} + } #endif diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index bbe0b1b6..51913984 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -144,6 +144,7 @@ struct LeftNfaInfo { u32 stopTable; // stop table index, or ROSE_OFFSET_INVALID u8 transient; /**< 0 if not transient, else max width of transient prefix */ char infix; /* TODO: make flags */ + char eager; /**< nfa should be run eagerly to first match or death */ char eod_check; /**< nfa is used by the event eod literal */ u32 countingMiracleOffset; /** if not 0, offset to RoseCountingMiracle. */ rose_group squash_mask; /* & mask applied when rose nfa dies */ @@ -155,8 +156,6 @@ struct NfaInfo { u32 fullStateOffset; /* offset in scratch, relative to ??? */ u32 ekeyListOffset; /* suffix, relative to base of rose, 0 if no ekeys */ u8 no_retrigger; /* TODO */ - u8 only_external; /**< does not raise any som internal events or chained - * rose events */ u8 in_sbmatcher; /**< this outfix should not be run in small-block * execution, as it will be handled by the sbmatcher * HWLM table. */ @@ -348,10 +347,15 @@ struct RoseEngine { * literals. */ u32 litDelayRebuildProgramOffset; - /** \brief Offset of u32 array of program offsets for internal reports. */ + /** + * \brief Offset of u32 array of program offsets for reports used by + * output-exposed engines. + */ u32 reportProgramOffset; - /** \brief Number of programs for internal reports. */ + /** + * \brief Number of programs for reports used by output-exposed engines. + */ u32 reportProgramCount; /** @@ -366,6 +370,9 @@ struct RoseEngine { u32 activeLeftCount; //number of nfas tracked in the active rose array u32 queueCount; /**< number of nfa queues */ + u32 eagerIterOffset; /**< offset to sparse iter for eager prefixes or 0 if + * none */ + /** \brief Number of keys used by CHECK_SET_HANDLED instructions in role * programs. Used to size the handled_roles fatbit in scratch. */ u32 handledKeyCount; @@ -376,12 +383,7 @@ struct RoseEngine { u32 lookaroundReachOffset; /**< base of lookaround reach bitvectors (32 * bytes each) */ - u32 eodProgramOffset; //!< Unconditional EOD program, otherwise 0. - u32 eodIterProgramOffset; // or 0 if no eod iterator program - u32 eodIterOffset; // offset to EOD sparse iter or 0 if none - - /** \brief Offset to sparse iter over outfix/suffix NFAs that accept EOD. */ - u32 eodNfaIterOffset; + u32 eodProgramOffset; //!< EOD program, otherwise 0. u32 lastByteHistoryIterOffset; // if non-zero @@ -406,6 +408,7 @@ struct RoseEngine { * table */ u32 nfaInfoOffset; /* offset to the nfa info offset array */ rose_group initialGroups; + rose_group floating_group_mask; /* groups that are used by the ftable */ u32 size; // (bytes) u32 delay_count; /* number of delayed literal ids. */ u32 delay_base_id; /* literal id of the first delayed literal. 
@@ -431,7 +434,6 @@ struct RoseEngine { u32 ematcherRegionSize; /* max region size to pass to ematcher */ u32 somRevCount; /**< number of som reverse nfas */ u32 somRevOffsetOffset; /**< offset to array of offsets to som rev nfas */ - u32 group_weak_end; /* end of weak groups, debugging only */ u32 floatingStreamState; // size in bytes struct scatter_full_plan state_init; @@ -468,17 +470,6 @@ const struct HWLM *getFLiteralMatcher(const struct RoseEngine *t) { return (const struct HWLM *)lt; } -static really_inline -const void *getELiteralMatcher(const struct RoseEngine *t) { - if (!t->ematcherOffset) { - return NULL; - } - - const char *et = (const char *)t + t->ematcherOffset; - assert(ISALIGNED_N(et, 8)); - return et; -} - static really_inline const void *getSBLiteralMatcher(const struct RoseEngine *t) { if (!t->sbmatcherOffset) { diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 01572dbd..545e190f 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -50,9 +50,12 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. + ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + ROSE_INSTR_RECORD_ANCHORED, //!< Record an anchored literal match. ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. @@ -96,6 +99,17 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + + /** \brief Check outfixes and suffixes for EOD and fire reports if so. */ + ROSE_INSTR_ENGINES_EOD, + + /** \brief Catch up and check active suffixes for EOD and fire reports if + * so. */ + ROSE_INSTR_SUFFIXES_EOD, + + /** \brief Run the EOD-anchored HWLM literal matcher. */ + ROSE_INSTR_MATCHER_EOD, + ROSE_INSTR_END //!< End of program. }; @@ -120,6 +134,7 @@ struct ROSE_STRUCT_CHECK_LIT_MASK { /** Note: check failure will halt program. */ struct ROSE_STRUCT_CHECK_LIT_EARLY { u8 code; //!< From enum RoseInstructionCode. + u32 min_offset; //!< Minimum offset for this literal. }; /** Note: check failure will halt program. */ @@ -153,6 +168,24 @@ struct ROSE_STRUCT_CHECK_LOOKAROUND { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_MASK { + u8 code; //!< From enum roseInstructionCode. + u64a and_mask; //!< 64-bits and mask. + u64a cmp_mask; //!< 64-bits cmp mask. + u64a neg_mask; //!< 64-bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_BYTE { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask; //!< 8-bits and mask. + u8 cmp_mask; //!< 8-bits cmp mask. + u8 negation; //!< Flag about negation. + s32 offset; //!< The relative offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. 
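/* Editor's note: illustrative sketch, not part of the patch. ROSE_STRUCT_CHECK_MASK
 * above packs 64-bit and/cmp/neg masks covering eight input bytes; the
 * branchless implementation lives in validate_mask.h later in this patch. A
 * byte-at-a-time reference version of the same check, assuming neg_mask
 * selects whole bytes whose comparison sense is inverted, would be: */
static int check_mask_reference(const unsigned char data[8],
                                const unsigned char and_mask[8],
                                const unsigned char cmp_mask[8],
                                const unsigned char neg_mask[8]) {
    for (int i = 0; i < 8; i++) {
        int byte_matches = (data[i] & and_mask[i]) == cmp_mask[i];
        /* negated bytes must NOT match; all other bytes must match */
        if (neg_mask[i] ? byte_matches : !byte_matches) {
            return 0; /* check failed: the program takes fail_jump */
        }
    }
    return 1;
}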
@@ -175,6 +208,11 @@ struct ROSE_STRUCT_PUSH_DELAYED { u32 index; // Delay literal index (relative to first delay lit). }; +struct ROSE_STRUCT_RECORD_ANCHORED { + u8 code; //!< From enum RoseInstructionCode. + u32 id; //!< Literal ID. +}; + struct ROSE_STRUCT_CATCH_UP { u8 code; //!< From enum RoseInstructionCode. }; @@ -351,6 +389,19 @@ struct ROSE_STRUCT_SPARSE_ITER_NEXT { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_ENGINES_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. +}; + +struct ROSE_STRUCT_SUFFIXES_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_MATCHER_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + struct ROSE_STRUCT_END { u8 code; //!< From enum RoseInstructionCode. }; diff --git a/src/rose/runtime.h b/src/rose/runtime.h index d4309bfb..60c7d34b 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -35,7 +35,6 @@ #include "rose_internal.h" #include "scratch.h" -#include "util/exhaust.h" // for isExhausted #include "util/partial_store.h" /* @@ -56,6 +55,11 @@ #define rose_inline really_inline +/* Maximum offset that we will eagerly run prefixes to. Beyond this point, eager + * prefixes are always run in exactly the same way as normal prefixes. */ +#define EAGER_STOP_OFFSET 64 + + static really_inline const void *getByOffset(const struct RoseEngine *t, u32 offset) { assert(offset < t->size); @@ -108,39 +112,6 @@ const u8 *getLeftfixLagTableConst(const struct RoseEngine *t, return (const u8 *)(state + t->stateOffsets.leftfixLagTable); } -static rose_inline -char roseSuffixInfoIsExhausted(const struct RoseEngine *t, - const struct NfaInfo *info, - const char *exhausted) { - if (!info->ekeyListOffset) { - return 0; - } - - DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); - - /* INVALID_EKEY terminated list */ - const u32 *ekeys = (const u32 *)((const char *)t + info->ekeyListOffset); - while (*ekeys != INVALID_EKEY) { - DEBUG_PRINTF("check %u\n", *ekeys); - if (!isExhausted(t, exhausted, *ekeys)) { - DEBUG_PRINTF("not exhausted -> alive\n"); - return 0; - } - ++ekeys; - } - - DEBUG_PRINTF("all ekeys exhausted -> dead\n"); - return 1; -} - -static really_inline -char roseSuffixIsExhausted(const struct RoseEngine *t, u32 qi, - const char *exhausted) { - DEBUG_PRINTF("check queue %u\n", qi); - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - return roseSuffixInfoIsExhausted(t, info, exhausted); -} - static really_inline u32 has_chained_nfas(const struct RoseEngine *t) { return t->outfixBeginQueue; diff --git a/src/rose/stream.c b/src/rose/stream.c index b08fe04d..b934f98f 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -31,13 +31,14 @@ #include "infix.h" #include "match.h" #include "miracle.h" +#include "program_runtime.h" +#include "rose.h" #include "hwlm/hwlm.h" #include "nfa/mcclellan.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_queue.h" #include "nfa/nfa_internal.h" #include "util/fatbit.h" -#include "rose.h" static rose_inline void runAnchoredTableStream(const struct RoseEngine *t, const void *atable, @@ -422,8 +423,95 @@ void do_rebuild(const struct RoseEngine *t, const struct HWLM *ftable, assert(!can_stop_matching(scratch)); } +static rose_inline +void runEagerPrefixesStream(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset + || scratch->core_info.buf_offset >= EAGER_STOP_OFFSET) { + return; + } + + char *state = scratch->core_info.state; + u8 
*ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + s64a loc = MIN(scratch->core_info.len, + EAGER_STOP_OFFSET - scratch->core_info.buf_offset); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + if (scratch->core_info.buf_offset) { + s64a sp = left->transient ? -(s64a)scratch->core_info.hlen + : -(s64a)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + if (scratch->core_info.buf_offset + sp > 0) { + loadStreamState(nfa, q, sp); + /* if the leftfix fix is currently in a match state, we cannot + * advance it. */ + if (nfaInAnyAcceptState(nfa, q)) { + continue; + } + pushQueueAt(q, 1, MQE_END, loc); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(q->nfa, q); + } + } else { + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + } + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + /* unlike in block mode we cannot squash groups if there is no match + * in this block as we need the groups on for later stream writes */ + /* TODO: investigate possibility of a method to suppress groups for + * a single stream block. 
*/ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { - DEBUG_PRINTF("OH HAI\n"); + DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset, + scratch->core_info.buf_offset + (u64a)scratch->core_info.len); assert(t); assert(scratch->core_info.hbuf); assert(scratch->core_info.buf); @@ -460,8 +548,8 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { tctxt->minMatchOffset = offset; tctxt->minNonMpvMatchOffset = offset; tctxt->next_mpv_offset = 0; - DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu\n", - scratch->core_info.hlen, scratch->core_info.len); + DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n", + scratch->core_info.hlen, scratch->core_info.len, tctxt->groups); fatbit_clear(scratch->aqa); scratch->al_log_sum = 0; @@ -471,6 +559,8 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { streamInitSufPQ(t, state, scratch); } + runEagerPrefixesStream(t, scratch); + u32 alen = t->anchoredDistance > offset ? MIN(length + offset, t->anchoredDistance) - offset : 0; @@ -539,8 +629,9 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { } DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); - hwlmExecStreaming(ftable, scratch, flen, start, roseCallback, scratch, - tctxt->groups, stream_state); + hwlmExecStreaming(ftable, scratch, flen, start, roseFloatingCallback, + scratch, tctxt->groups & t->floating_group_mask, + stream_state); } flush_delay_and_exit: @@ -558,3 +649,67 @@ exit: scratch->core_info.status); return; } + +static rose_inline +void roseStreamInitEod(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + /* TODO: diff groups for eod */ + tctxt->groups = loadGroups(t, scratch->core_info.state); + tctxt->lit_offset_adjust = scratch->core_info.buf_offset + - scratch->core_info.hlen + + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = offset; + + scratch->catchup_pq.qm_size = 0; + scratch->al_log_sum = 0; /* clear the anchored logs */ + + fatbit_clear(scratch->aqa); +} + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(scratch); + assert(t->requiresEodCheck); + DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, + scratch->core_info.len, scratch->core_info.hbuf, + scratch->core_info.hlen); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, we are beyond max width\n"); + /* also some of the history/state may be stale */ + return; + } + + if (!t->eodProgramOffset) { + DEBUG_PRINTF("no eod program\n"); + return; + } + + roseStreamInitEod(t, offset, scratch); + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. 
+ assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const size_t match_len = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, + flags); +} diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h new file mode 100644 index 00000000..b2c2f5d6 --- /dev/null +++ b/src/rose/validate_mask.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ue2common.h" + +// check positive bytes in cmp_result. +// return one if the check passed, zero otherwise. +static really_inline +int posValidateMask(const u64a cmp_result, const u64a pos_mask) { + return !(cmp_result & pos_mask); +} + +/* + * check negative bytes in cmp_result. + * return one if any byte in cmp_result is not 0, zero otherwise. + * check lowest 7 bits and highest bit of every byte respectively. + */ +static really_inline +int negValidateMask(const u64a cmp_result, const u64a neg_mask) { + const u64a count_mask = 0x7f7f7f7f7f7f7f7f; + // check lowest 7 bits of every byte. + // the highest bit should be 1 if check passed. + u64a check_low = (cmp_result & count_mask) + count_mask; + // check the highest bit of every byte. + // combine the highest bit and 0x7f to 0xff if check passes. + // flip all 0xff to 0x00 and 0x7f to 0x80. + u64a check_all = ~(check_low | cmp_result | count_mask); + return !(check_all & neg_mask); +} + +static really_inline +int validateMask(u64a data, u64a valid_data_mask, u64a and_mask, + u64a cmp_mask, u64a neg_mask) { + // skip some byte where valid_data_mask is 0x00 there. 
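    /* Editor's note (worked example, not part of the patch): negValidateMask
     * above reduces "is this byte of cmp_result non-zero?" to bit arithmetic.
     * For one byte b, with count_mask = 0x7f:
     *     check_low = (b & 0x7f) + 0x7f      -> bit 7 set iff any low 7 bit of b is set
     *     check_all = ~(check_low | b | 0x7f) -> 0x80 if b == 0, else 0x00
     * e.g. b = 0x00: check_low = 0x7f, check_all = ~(0x7f|0x00|0x7f) = 0x80
     *      b = 0x01: check_low = 0x80, check_all = ~(0x80|0x01|0x7f) = 0x00
     *      b = 0x80: check_low = 0x7f, check_all = ~(0x7f|0x80|0x7f) = 0x00
     * so (check_all & neg_mask) is non-zero exactly when some negated byte of
     * cmp_result is zero, i.e. a byte that was required to differ compared
     * equal, and negValidateMask correctly reports failure. */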
+ and_mask &= valid_data_mask; + cmp_mask &= valid_data_mask; + neg_mask &= valid_data_mask; + u64a cmp_result = (data & and_mask) ^ cmp_mask; + /* do the positive check first since it's cheaper */ + if (posValidateMask(cmp_result, ~neg_mask) + && negValidateMask(cmp_result, neg_mask)) { + return 1; + } else { + DEBUG_PRINTF("data %llx valid_data_mask(vdm) %llx\n", + data, valid_data_mask); + DEBUG_PRINTF("and_mask & vdm %llx cmp_mask & vdm %llx\n", and_mask, + cmp_mask); + DEBUG_PRINTF("cmp_result %llx neg_mask & vdm %llx\n", + cmp_result, neg_mask); + return 0; + } +} diff --git a/src/runtime.c b/src/runtime.c index 95f21d84..35a11634 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -43,6 +43,7 @@ #include "nfa/nfa_api_util.h" #include "nfa/nfa_internal.h" #include "nfa/nfa_rev_api.h" +#include "nfa/sheng.h" #include "smallwrite/smallwrite_internal.h" #include "rose/rose.h" #include "rose/runtime.h" @@ -198,7 +199,11 @@ void pureLiteralBlockExec(const struct RoseEngine *rose, size_t length = scratch->core_info.len; DEBUG_PRINTF("rose engine %d\n", rose->runtimeImpl); - hwlmExec(ftable, buffer, length, 0, rosePureLiteralCallback, scratch, + // RoseContext values that need to be set for use by roseCallback. + scratch->tctxt.groups = rose->initialGroups; + scratch->tctxt.lit_offset_adjust = 1; + + hwlmExec(ftable, buffer, length, 0, roseCallback, scratch, rose->initialGroups); } @@ -217,7 +222,6 @@ void initOutfixQueue(struct mq *q, u32 qi, const struct RoseEngine *t, q->history = scratch->core_info.hbuf; q->hlength = scratch->core_info.hlen; q->cb = roseReportAdaptor; - q->som_cb = roseReportSomAdaptor; q->context = scratch; q->report_current = 0; @@ -257,8 +261,8 @@ void soleOutfixBlockExec(const struct RoseEngine *t, char rv = nfaQueueExec(q->nfa, q, scratch->core_info.len); if (rv && nfaAcceptsEod(nfa) && len == scratch->core_info.len) { - nfaCheckFinalState(nfa, q->state, q->streamState, q->length, - q->cb, q->som_cb, scratch); + nfaCheckFinalState(nfa, q->state, q->streamState, q->length, q->cb, + scratch); } } @@ -283,13 +287,16 @@ void runSmallWriteEngine(const struct SmallWriteEngine *smwr, size_t local_alen = length - smwr->start_offset; const u8 *local_buffer = buffer + smwr->start_offset; - assert(isMcClellanType(nfa->type)); + assert(isDfaType(nfa->type)); if (nfa->type == MCCLELLAN_NFA_8) { nfaExecMcClellan8_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); - } else { + } else if (nfa->type == MCCLELLAN_NFA_16){ nfaExecMcClellan16_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); + } else { + nfaExecSheng0_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); } } @@ -532,7 +539,7 @@ void rawEodExec(hs_stream_t *id, hs_scratch_t *scratch) { return; } - roseEodExec(rose, id->offset, scratch); + roseStreamEodExec(rose, id->offset, scratch); } static never_inline @@ -568,7 +575,7 @@ void soleOutfixEodExec(hs_stream_t *id, hs_scratch_t *scratch) { assert(nfaAcceptsEod(nfa)); nfaCheckFinalState(nfa, q->state, q->streamState, q->offset, q->cb, - q->som_cb, scratch); + scratch); } static really_inline @@ -743,11 +750,15 @@ void pureLiteralStreamExec(struct hs_stream *stream_state, DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); + // RoseContext values that need to be set for use by roseCallback. 
+ scratch->tctxt.groups = loadGroups(rose, scratch->core_info.state); + scratch->tctxt.lit_offset_adjust = scratch->core_info.buf_offset + 1; + // Pure literal cases don't have floatingMinDistance set, so we always // start the match region at zero. const size_t start = 0; - hwlmExecStreaming(ftable, scratch, len2, start, rosePureLiteralCallback, + hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, scratch, rose->initialGroups, hwlm_stream_state); if (!told_to_stop_matching(scratch) && diff --git a/src/scratch.c b/src/scratch.c index d8742e7d..dae2c672 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -227,6 +227,11 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { // Don't get too big for your boots assert((size_t)(current - (char *)s) <= alloc_size); + // Init q->scratch ptr for every queue. + for (struct mq *qi = s->queues; qi != s->queues + queueCount; ++qi) { + qi->scratch = s; + } + return HS_SUCCESS; } diff --git a/src/scratch.h b/src/scratch.h index f8e322f8..a2f02503 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -45,7 +45,7 @@ extern "C" #endif UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; -#define FDR_TEMP_BUF_SIZE 200 +#define FDR_TEMP_BUF_SIZE 220 struct fatbit; struct hs_scratch; @@ -141,7 +141,6 @@ struct match_deduper { struct ALIGN_CL_DIRECTIVE hs_scratch { u32 magic; u8 in_use; /**< non-zero when being used by an API call. */ - char *scratch_alloc; /* user allocated scratch object */ u32 queueCount; u32 bStateSize; /**< sizeof block mode states */ u32 tStateSize; /**< sizeof transient rose states */ @@ -161,10 +160,6 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { struct match_deduper deduper; u32 anchored_literal_region_len; u32 anchored_literal_count; - u32 delay_count; - u32 scratchSize; - u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE]; - u32 handledKeyCount; struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already * handled by this literal */ u64a *som_store; /**< array of som locations */ @@ -176,6 +171,11 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { * location had been writable */ u64a som_set_now_offset; /**< offset at which som_set_now represents */ u32 som_store_count; + u32 handledKeyCount; + u32 delay_count; + u32 scratchSize; + char *scratch_alloc; /* user allocated scratch object */ + u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE]; }; /* array of fatbit ptr; TODO: why not an array of fatbits? */ diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 792a3d5b..90770ba5 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -34,6 +34,7 @@ #include "nfa/mcclellancompile_util.h" #include "nfa/nfa_internal.h" #include "nfa/rdfa_merge.h" +#include "nfa/shengcompile.h" #include "nfagraph/ng.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_mcclellan.h" @@ -65,7 +66,8 @@ namespace { // unnamed // Concrete impl class class SmallWriteBuildImpl : public SmallWriteBuild { public: - SmallWriteBuildImpl(const ReportManager &rm, const CompileContext &cc); + SmallWriteBuildImpl(size_t num_patterns, const ReportManager &rm, + const CompileContext &cc); // Construct a runtime implementation. 
aligned_unique_ptr build(u32 roseQuality) override; @@ -73,6 +75,8 @@ public: void add(const NGWrapper &w) override; void add(const ue2_literal &literal, ReportID r) override; + set all_reports() const override; + bool determiniseLiterals(); const ReportManager &rm; @@ -87,11 +91,14 @@ public: SmallWriteBuild::~SmallWriteBuild() { } -SmallWriteBuildImpl::SmallWriteBuildImpl(const ReportManager &rm_in, +SmallWriteBuildImpl::SmallWriteBuildImpl(size_t num_patterns, + const ReportManager &rm_in, const CompileContext &cc_in) : rm(rm_in), cc(cc_in), /* small write is block mode only */ - poisoned(!cc.grey.allowSmallWrite || cc.streaming) { + poisoned(!cc.grey.allowSmallWrite + || cc.streaming + || num_patterns > cc.grey.smallWriteMaxPatterns) { } void SmallWriteBuildImpl::add(const NGWrapper &w) { @@ -163,6 +170,10 @@ void SmallWriteBuildImpl::add(const ue2_literal &literal, ReportID r) { } cand_literals.push_back(make_pair(literal, r)); + + if (cand_literals.size() > cc.grey.smallWriteMaxLiterals) { + poisoned = true; + } } static @@ -181,6 +192,7 @@ void lit_to_graph(NGHolder *h, const ue2_literal &literal, ReportID r) { bool SmallWriteBuildImpl::determiniseLiterals() { DEBUG_PRINTF("handling literals\n"); assert(!poisoned); + assert(cand_literals.size() <= cc.grey.smallWriteMaxLiterals); if (cand_literals.empty()) { return true; /* nothing to do */ @@ -301,6 +313,20 @@ bool is_slow(const raw_dfa &rdfa, const set &accel, return true; } +static +aligned_unique_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, + const ReportManager &rm, + set &accel_states) { + aligned_unique_ptr dfa = nullptr; + if (cc.grey.allowSmallWriteSheng) { + dfa = shengCompile(rdfa, cc, rm, &accel_states); + } + if (!dfa) { + dfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + } + return dfa; +} + static aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, const CompileContext &cc, @@ -311,9 +337,9 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, // Unleash the McClellan! set accel_states; - auto nfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + auto nfa = getDfa(rdfa, cc, rm, accel_states); if (!nfa) { - DEBUG_PRINTF("mcclellan compile failed for smallwrite NFA\n"); + DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); return nullptr; } @@ -329,9 +355,9 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, return nullptr; } - nfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + nfa = getDfa(rdfa, cc, rm, accel_states); if (!nfa) { - DEBUG_PRINTF("mcclellan compile failed for smallwrite NFA\n"); + DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); assert(0); /* able to build orig dfa but not the trimmed? 
*/ return nullptr; } @@ -340,7 +366,7 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, *small_region = cc.grey.smallWriteLargestBuffer; } - assert(isMcClellanType(nfa->type)); + assert(isDfaType(nfa->type)); if (nfa->length > cc.grey.limitSmallWriteOutfixSize || nfa->length > cc.grey.limitDFASize) { DEBUG_PRINTF("smallwrite outfix size too large\n"); @@ -352,9 +378,10 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, } // SmallWriteBuild factory -unique_ptr makeSmallWriteBuilder(const ReportManager &rm, +unique_ptr makeSmallWriteBuilder(size_t num_patterns, + const ReportManager &rm, const CompileContext &cc) { - return ue2::make_unique(rm, cc); + return ue2::make_unique(num_patterns, rm, cc); } aligned_unique_ptr @@ -403,6 +430,20 @@ SmallWriteBuildImpl::build(u32 roseQuality) { return smwr; } +set SmallWriteBuildImpl::all_reports() const { + set reports; + if (poisoned) { + return reports; + } + if (rdfa) { + insert(&reports, ::ue2::all_reports(*rdfa)); + } + for (const auto &cand : cand_literals) { + reports.insert(cand.second); + } + return reports; +} + size_t smwrSize(const SmallWriteEngine *smwr) { assert(smwr); return smwr->size; diff --git a/src/smallwrite/smallwrite_build.h b/src/smallwrite/smallwrite_build.h index 9c3de9d3..84c6df3a 100644 --- a/src/smallwrite/smallwrite_build.h +++ b/src/smallwrite/smallwrite_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,8 @@ #include "ue2common.h" #include "util/alloc.h" +#include + #include struct SmallWriteEngine; @@ -61,10 +63,13 @@ public: virtual void add(const NGWrapper &w) = 0; virtual void add(const ue2_literal &literal, ReportID r) = 0; + + virtual std::set all_reports() const = 0; }; // Construct a usable SmallWrite builder. -std::unique_ptr makeSmallWriteBuilder(const ReportManager &rm, +std::unique_ptr makeSmallWriteBuilder(size_t num_patterns, + const ReportManager &rm, const CompileContext &cc); size_t smwrSize(const SmallWriteEngine *t); diff --git a/src/smallwrite/smallwrite_dump.cpp b/src/smallwrite/smallwrite_dump.cpp index 8987e8b3..0db97df5 100644 --- a/src/smallwrite/smallwrite_dump.cpp +++ b/src/smallwrite/smallwrite_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -73,7 +73,7 @@ void smwrDumpNFA(const SmallWriteEngine *smwr, bool dump_raw, FILE *f; f = fopen((base + "smallwrite_nfa.dot").c_str(), "w"); - nfaDumpDot(n, f); + nfaDumpDot(n, f, base); fclose(f); f = fopen((base + "smallwrite_nfa.txt").c_str(), "w"); diff --git a/src/som/som_runtime.c b/src/som/som_runtime.c index 9d0a1390..1a868efc 100644 --- a/src/som/som_runtime.c +++ b/src/som/som_runtime.c @@ -87,14 +87,14 @@ char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now, } static -int somRevCallback(u64a offset, ReportID id, void *ctx) { - DEBUG_PRINTF("offset=%llu, id=%u\n", offset, id); +int somRevCallback(UNUSED u64a start, u64a end, ReportID id, void *ctx) { + DEBUG_PRINTF("offset=%llu, id=%u\n", end, id); // We use the id to store the offset adjustment (for assertions like a // leading \b or multiline mode). 
assert(id <= 1); u64a *from_offset = ctx; - LIMIT_TO_AT_MOST(from_offset, offset + id); + LIMIT_TO_AT_MOST(from_offset, end + id); return 1; // continue matching. } diff --git a/src/ue2common.h b/src/ue2common.h index 2de60753..e1f03f72 100644 --- a/src/ue2common.h +++ b/src/ue2common.h @@ -52,6 +52,9 @@ #define ALIGN_ATTR(x) __attribute__((aligned((x)))) #endif +#define ALIGN_DIRECTIVE ALIGN_ATTR(16) +#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) +#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) typedef signed char s8; typedef unsigned char u8; @@ -82,10 +85,6 @@ typedef u32 ReportID; #define HS_PUBLIC_API #endif -#define ALIGN_DIRECTIVE ALIGN_ATTR(16) -#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) -#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) - #define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) /** \brief Shorthand for the attribute to shut gcc about unused parameters */ diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c863fba9..6f1bcd09 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -454,4 +454,20 @@ void bf64_unset(u64a *bitfield, u32 i) { *bitfield &= ~(1ULL << i); } +static really_inline +u32 rank_in_mask32(u32 mask, u32 bit) { + assert(bit < sizeof(u32) * 8); + assert(mask & (u32)(1U << bit)); + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64(u64a mask, u32 bit) { + assert(bit < sizeof(u64a) * 8); + assert(mask & (u64a)(1ULL << bit)); + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + #endif // BITUTILS_H diff --git a/src/util/clique.cpp b/src/util/clique.cpp new file mode 100644 index 00000000..ea22779c --- /dev/null +++ b/src/util/clique.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief An algorithm to find cliques. 
+ */ + +#include "clique.h" +#include "container.h" +#include "graph_range.h" +#include "make_unique.h" +#include "ue2_containers.h" + +#include +#include +#include + +using namespace std; + +namespace ue2 { + +static +vector getNeighborInfo(const CliqueGraph &g, + const CliqueVertex &cv, const set &group) { + u32 id = g[cv].stateId; + vector neighbor; + // find neighbors for cv + for (const auto &v : adjacent_vertices_range(cv, g)) { + if (g[v].stateId != id && contains(group, g[v].stateId)){ + neighbor.push_back(g[v].stateId); + DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); + } + } + + return neighbor; +} + +static +vector findCliqueGroup(CliqueGraph &cg) { + stack> gStack; + + // Create mapping between vertex and id + map vertexMap; + vector init; + for (const auto &v : vertices_range(cg)) { + vertexMap[cg[v].stateId] = v; + init.push_back(cg[v].stateId); + } + gStack.push(init); + + // Get the vertex to start from + vector clique; + while (!gStack.empty()) { + vector g = move(gStack.top()); + gStack.pop(); + + // Choose a vertex from the graph + u32 id = g[0]; + CliqueVertex &n = vertexMap.at(id); + clique.push_back(id); + // Corresponding vertex in the original graph + set subgraphId(g.begin(), g.end()); + auto neighbor = getNeighborInfo(cg, n, subgraphId); + // Get graph consisting of neighbors for left branch + if (!neighbor.empty()) { + gStack.push(neighbor); + } + } + + return clique; +} + +template +bool graph_empty(const Graph &g) { + typename Graph::vertex_iterator vi, ve; + tie(vi, ve) = vertices(g); + return vi == ve; +} + +vector> removeClique(CliqueGraph &cg) { + DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); + vector> cliquesVec = {findCliqueGroup(cg)}; + while (!graph_empty(cg)) { + const vector &c = cliquesVec.back(); + vector dead; + for (const auto &v : vertices_range(cg)) { + u32 id = cg[v].stateId; + if (find(c.begin(), c.end(), id) != c.end()) { + dead.push_back(v); + } + } + for (const auto &v : dead) { + clear_vertex(v, cg); + remove_vertex(v, cg); + } + if (graph_empty(cg)) { + break; + } + auto clique = findCliqueGroup(cg); + cliquesVec.push_back(clique); + } + + return cliquesVec; +} + +} // namespace ue2 diff --git a/src/nfa/limex_simd512a.c b/src/util/clique.h similarity index 61% rename from src/nfa/limex_simd512a.c rename to src/util/clique.h index 1c4a0fb9..89c6d4ed 100644 --- a/src/nfa/limex_simd512a.c +++ b/src/util/clique.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,48 +27,34 @@ */ /** \file - * \brief LimEx NFA: 512-bit SIMD runtime implementations. + * \brief An algorithm to find cliques. 
*/ -//#define DEBUG_INPUT -//#define DEBUG_EXCEPTIONS +#ifndef CLIQUE_H +#define CLIQUE_H -#include "limex.h" - -#include "accel.h" -#include "limex_internal.h" -#include "nfa_internal.h" #include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" -// Common code -#include "limex_runtime.h" +#include -#define SIZE 512 -#define STATE_T m512 -#include "limex_exceptional.h" +#include -#define SIZE 512 -#define STATE_T m512 -#include "limex_state_impl.h" +namespace ue2 { -#define SIZE 512 -#define STATE_T m512 -#define INLINE_ATTR really_inline -#include "limex_common_impl.h" +struct CliqueVertexProps { + CliqueVertexProps() {} + explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {} -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 2 -#include "limex_runtime_impl.h" + u32 stateId = ~0U; +}; -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 1 -#include "limex_runtime_impl.h" +typedef boost::adjacency_list CliqueGraph; +typedef CliqueGraph::vertex_descriptor CliqueVertex; -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 3 -#include "limex_runtime_impl.h" +/** \brief Returns a vector of cliques found in a graph. */ +std::vector> removeClique(CliqueGraph &cg); + +} // namespace ue2 + +#endif diff --git a/src/util/exhaust.h b/src/util/exhaust.h index b55c52d7..d6f2ac06 100644 --- a/src/util/exhaust.h +++ b/src/util/exhaust.h @@ -33,47 +33,9 @@ #ifndef EXHAUST_H #define EXHAUST_H -#include "rose/rose_internal.h" -#include "util/multibit.h" #include "ue2common.h" /** Index meaning a given exhaustion key is invalid. */ #define INVALID_EKEY (~(u32)0) -/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector - * \a evec. */ -static really_inline -int isExhausted(const struct RoseEngine *t, const char *evec, u32 ekey) { - DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey); - assert(ekey != INVALID_EKEY); - assert(ekey < t->ekeyCount); - return mmbit_isset((const u8 *)evec, t->ekeyCount, ekey); -} - -/** \brief Returns 1 if all exhaustion keys in the bitvector are on. */ -static really_inline -int isAllExhausted(const struct RoseEngine *t, const char *evec) { - if (!t->canExhaust) { - return 0; /* pattern set is inexhaustible */ - } - - return mmbit_all((const u8 *)evec, t->ekeyCount); -} - -/** \brief Mark key \a ekey on in the exhaustion vector. */ -static really_inline -void markAsMatched(const struct RoseEngine *t, char *evec, u32 ekey) { - DEBUG_PRINTF("marking as exhausted key %u\n", ekey); - assert(ekey != INVALID_EKEY); - assert(ekey < t->ekeyCount); - mmbit_set((u8 *)evec, t->ekeyCount, ekey); -} - -/** \brief Clear all keys in the exhaustion vector. 
*/ -static really_inline -void clearEvec(const struct RoseEngine *t, char *evec) { - DEBUG_PRINTF("clearing evec %p %u\n", evec, t->ekeyCount); - mmbit_clear((u8 *)evec, t->ekeyCount); -} - #endif diff --git a/src/util/masked_move.h b/src/util/masked_move.h index 93c79e75..09276e80 100644 --- a/src/util/masked_move.h +++ b/src/util/masked_move.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,6 @@ #include "unaligned.h" #include "simd_utils.h" -#include "simd_utils_ssse3.h" #ifdef __cplusplus extern "C" { diff --git a/src/util/shuffle_ssse3.h b/src/util/shuffle_ssse3.h deleted file mode 100644 index d295839b..00000000 --- a/src/util/shuffle_ssse3.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef SHUFFLE_SSSE3_H -#define SHUFFLE_SSSE3_H - -#include "simd_utils_ssse3.h" - -#ifdef DEBUG -#include "compare.h" -static really_inline void shufDumpMsk(m128 msk) { - u8 * mskAsU8 = (u8 *)&msk; - for (int i = 0; i < 16; i++) { - u8 c = mskAsU8[i]; - for (int j = 0; j < 8; j++) { - if ((c >> (7-j)) & 0x1) - printf("1"); - else - printf("0"); - } - printf(" "); - } -} - -static really_inline void shufDumpMskAsChars(m128 msk) { - u8 * mskAsU8 = (u8 *)&msk; - for (int i = 0; i < 16; i++) { - u8 c = mskAsU8[i]; - if (ourisprint(c)) - printf("%c",c); - else - printf("."); - } -} -#endif - -#if !defined(NO_SSSE3) -static really_inline -u32 shufflePshufb128(m128 s, const m128 permute, const m128 compare) { - m128 shuffled = pshufb(s, permute); - m128 compared = and128(shuffled, compare); -#ifdef DEBUG - printf("State: "); shufDumpMsk(s); printf("\n"); - printf("Permute: "); shufDumpMsk(permute); printf("\n"); - printf("Compare: "); shufDumpMsk(compare); printf("\n"); - printf("Shuffled: "); shufDumpMsk(shuffled); printf("\n"); - printf("Compared: "); shufDumpMsk(compared); printf("\n"); -#endif - u16 rv = ~cmpmsk8(compared, shuffled); - return (u32)rv; -} -#endif // NO_SSSE3 - -#endif // SHUFFLE_SSSE3_H diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 63311b10..e4541411 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,7 +65,7 @@ typedef __m128i m128; #if defined(__AVX2__) typedef __m256i m256; #else -typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256; #endif // these should align to 16 and 32 respectively diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c new file mode 100644 index 00000000..a86c568d --- /dev/null +++ b/src/util/simd_utils.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Lookup tables to support SIMD operations. + */ + +#include "simd_utils.h" + +const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, +}; + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = { + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, +}; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 99ad7ce5..3544629f 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -33,6 +33,10 @@ #ifndef SIMD_UTILS #define SIMD_UTILS +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + #include "config.h" #include // for memcpy @@ -68,10 +72,6 @@ #include "ue2common.h" #include "simd_types.h" -#if defined(__GNUC__) -#define USE_GCC_COMPOUND_STATEMENTS -#endif - // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. #ifdef __cplusplus @@ -84,23 +84,26 @@ # endif #endif -#ifdef _WIN32 -#define NO_ASM -#endif - // Fallback to identity case. #ifndef assume_aligned #define assume_aligned(x, y) (x) #endif +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + static really_inline m128 ones128(void) { -#if !defined(NO_ASM) - // trick from Intel's optimization guide to generate all-ones. We have to - // use inline asm, as there's no intrinsic for this yet. - m128 ret; - __asm__ ("pcmpeqb %0,%0" : "=x"(ret)); - return ret; +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); #else + /* trick from Intel's optimization guide to generate all-ones. 
+ * ICC converts this to the single cmpeq instruction */ return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); #endif } @@ -146,34 +149,13 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } -// forward decl -static really_inline m128 xor128(m128 a, m128 b); - -/** \brief Return msb mask of packet 8 bit compare equal */ -static really_inline unsigned short cmpmsk8(m128 a, m128 b) { - m128 tmp = _mm_cmpeq_epi8(a, b); - return _mm_movemask_epi8(tmp); -} - -#define shift2x64(a, b) _mm_slli_epi64((a), (b)) -#define rshift2x64(a, b) _mm_srli_epi64((a), (b)) +#define lshift64_m128(a, b) _mm_slli_epi64((a), (b)) +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -// We found that this generated better code with gcc-4.1 and with the default -// tuning settings on gcc-4.4 than just using the _mm_set1_epi8() instrinsic. static really_inline m128 set16x8(u8 c) { -#if !defined(__AVX2__) - m128 a = _mm_cvtsi32_si128((int)c); - a = _mm_unpacklo_epi8(a, a); - a = _mm_unpacklo_epi8(a, a); - a = _mm_shuffle_epi32(a, 0); - return a; -#else - // uses a broadcast for much win return _mm_set1_epi8(c); -#endif } static really_inline u32 movd(const m128 in) { @@ -190,16 +172,8 @@ static really_inline u64a movq(const m128 in) { #endif } -static really_inline m128 shiftRight8Bits(m128 a) { - return _mm_srli_si128(a,1); -} - -static really_inline m128 shiftLeft8Bits(m128 a) { - return _mm_slli_si128(a,1); -} - -#define byteShiftRight128(a, count_immed) _mm_srli_si128(a, count_immed) -#define byteShiftLeft128(a, count_immed) _mm_slli_si128(a, count_immed) +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) #if !defined(__AVX2__) // TODO: this entire file needs restructuring - this carveout is awful @@ -209,8 +183,8 @@ static really_inline m128 shiftLeft8Bits(m128 a) { #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2) #else -#define extract32from256(a, imm) movd(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8)) -#define extract64from256(a, imm) movq(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8)) +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8)) #endif #endif // !AVX2 @@ -231,10 +205,6 @@ static really_inline m128 andnot128(m128 a, m128 b) { return _mm_andnot_si128(a, b); } -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD. -#define shift128(a, b) _mm_slli_epi64((a), (b)) - // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); @@ -275,70 +245,85 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +extern const u8 simd_onebit_masks[]; + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + // switches on bit N in the given vector. static really_inline void setbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. 
- union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or128(mask1bit128(n), *ptr); } // switches off bit N in the given vector. static really_inline void clearbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot128(mask1bit128(n), *ptr); } // tests bit N in the given vector. static really_inline char testbit128(const m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m128 mask = mask1bit128(n); +#if defined(__SSE4_1__) + return !_mm_testz_si128(mask, *ptr); +#else + return isnonzero128(and128(mask, *ptr)); +#endif } +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 vpshufb(m256 a, m256 b) { +#if defined(__AVX2__) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb(a.lo, b.lo); + rv.hi = pshufb(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb(in, shift_mask); +} + + /**** **** 256-bit Primitives ****/ #if defined(__AVX2__) -#define shift4x64(a, b) _mm256_slli_epi64((a), (b)) -#define rshift4x64(a, b) _mm256_srli_epi64((a), (b)) +#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b)) +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) static really_inline m256 set32x8(u32 in) { - m128 a = _mm_cvtsi32_si128(in); - return _mm256_broadcastb_epi8(a); + return _mm256_set1_epi8(in); } #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) #define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -static really_inline u32 cmpmsk16(m256 a, m256 b) { - m256 tmp = _mm256_cmpeq_epi8(a, b); - return _mm256_movemask_epi8(tmp); -} static really_inline m256 set2x128(m128 a) { return _mm256_broadcastsi128_si256(a); @@ -347,18 +332,18 @@ m256 set2x128(m128 a) { #else static really_inline -m256 shift4x64(m256 a, int b) { +m256 lshift64_m256(m256 a, int b) { m256 rv = a; - rv.lo = shift2x64(rv.lo, b); - rv.hi = shift2x64(rv.hi, b); + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); return rv; } static really_inline -m256 rshift4x64(m256 a, int b) { +m256 rshift64_m256(m256 a, int b) { m256 rv = a; - rv.lo = rshift2x64(rv.lo, b); - rv.hi = rshift2x64(rv.hi, b); + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); return rv; } static really_inline @@ -382,12 +367,7 @@ static really_inline m256 zeroes256(void) { static really_inline m256 ones256(void) { #if defined(__AVX2__) - m256 rv; -#if defined(NO_ASM) - rv = eq256(zeroes256(), zeroes256()); -#else - __asm__ ("vpcmpeqb %0,%0,%0" : "=x"(rv)); -#endif + m256 rv = _mm256_set1_epi8(0xFF); #else m256 rv = {ones128(), ones128()}; #endif @@ -398,13 +378,6 @@ static really_inline m256 ones256(void) { static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define and256(a, b) ({ 
\ - m256 rv_and256; \ - rv_and256.lo = and128((a).lo, (b).lo); \ - rv_and256.hi = and128((a).hi, (b).hi); \ - rv_and256; \ -}) #else static really_inline m256 and256(m256 a, m256 b) { m256 rv; @@ -418,13 +391,6 @@ static really_inline m256 and256(m256 a, m256 b) { static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define or256(a, b) ({ \ - m256 rv_or256; \ - rv_or256.lo = or128((a).lo, (b).lo); \ - rv_or256.hi = or128((a).hi, (b).hi); \ - rv_or256; \ -}) #else static really_inline m256 or256(m256 a, m256 b) { m256 rv; @@ -438,13 +404,6 @@ static really_inline m256 or256(m256 a, m256 b) { static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor256(a, b) ({ \ - m256 rv_xor256; \ - rv_xor256.lo = xor128((a).lo, (b).lo); \ - rv_xor256.hi = xor128((a).hi, (b).hi); \ - rv_xor256; \ -}) #else static really_inline m256 xor256(m256 a, m256 b) { m256 rv; @@ -458,13 +417,6 @@ static really_inline m256 xor256(m256 a, m256 b) { static really_inline m256 not256(m256 a) { return _mm256_xor_si256(a, ones256()); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define not256(a) ({ \ - m256 rv_not256; \ - rv_not256.lo = not128((a).lo); \ - rv_not256.hi = not128((a).hi); \ - rv_not256; \ -}) #else static really_inline m256 not256(m256 a) { m256 rv; @@ -478,13 +430,6 @@ static really_inline m256 not256(m256 a) { static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot256(a, b) ({ \ - m256 rv_andnot256; \ - rv_andnot256.lo = andnot128((a).lo, (b).lo); \ - rv_andnot256.hi = andnot128((a).hi, (b).hi); \ - rv_andnot256; \ -}) #else static really_inline m256 andnot256(m256 a, m256 b) { m256 rv; @@ -494,26 +439,6 @@ static really_inline m256 andnot256(m256 a, m256 b) { } #endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). 
-#if defined(__AVX2__) -#define shift256(a, b) _mm256_slli_epi64((a), (b)) -#elif defined(__GNUC__) -#define shift256(a, b) ({ \ - m256 rv_shift256; \ - rv_shift256.lo = shift128(a.lo, b); \ - rv_shift256.hi = shift128(a.hi, b); \ - rv_shift256; \ -}) -#else -static really_inline m256 shift256(m256 a, unsigned b) { - m256 rv; - rv.lo = shift128(a.lo, b); - rv.hi = shift128(a.hi, b); - return rv; -} -#endif - static really_inline int diff256(m256 a, m256 b) { #if defined(__AVX2__) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); @@ -558,11 +483,10 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { -#if defined(__AVX2__) assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(__AVX2__) return _mm256_load_si256((const m256 *)ptr); #else - assert(ISALIGNED_N(ptr, alignof(m128))); m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; return rv; #endif @@ -582,11 +506,10 @@ static really_inline m256 load2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { -#if defined(__AVX2__) assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(__AVX2__) _mm256_store_si256((m256 *)ptr, a); #else - assert(ISALIGNED_16(ptr)); ptr = assume_aligned(ptr, 16); *(m256 *)ptr = a; #endif @@ -618,6 +541,14 @@ m256 loadbytes256(const void *ptr, unsigned int n) { return a; } +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + #if !defined(__AVX2__) // switches on bit N in the given vector. static really_inline @@ -666,42 +597,19 @@ char testbit256(const m256 *ptr, unsigned int n) { // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or256(mask1bit256(n), *ptr); } -// TODO: can we do this better in avx-land? static really_inline void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot256(mask1bit256(n), *ptr); } // tests bit N in the given vector. 
static really_inline char testbit256(const m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, *ptr); } static really_really_inline @@ -714,27 +622,19 @@ m128 movdq_lo(m256 x) { return _mm256_extracti128_si256(x, 0); } -static really_inline -m256 shift256Right8Bits(m256 a) { - return _mm256_srli_si256(a, 1); -} - -static really_inline -m256 shift256Left8Bits(m256 a) { - return _mm256_slli_si256(a, 1); -} #define cast256to128(a) _mm256_castsi256_si128(a) #define cast128to256(a) _mm256_castsi128_si256(a) #define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) #define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define byteShiftRight256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define byteShiftLeft256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) #define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) #define extractlow32from256(a) movd(cast256to128(a)) #define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b); #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b); +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) #endif //AVX2 @@ -742,15 +642,6 @@ m256 shift256Left8Bits(m256 a) { **** 384-bit Primitives ****/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define and384(a, b) ({ \ - m384 rv_and384; \ - rv_and384.lo = and128((a).lo, (b).lo); \ - rv_and384.mid = and128((a).mid, (b).mid); \ - rv_and384.hi = and128((a).hi, (b).hi); \ - rv_and384; \ -}) -#else static really_inline m384 and384(m384 a, m384 b) { m384 rv; rv.lo = and128(a.lo, b.lo); @@ -758,17 +649,7 @@ static really_inline m384 and384(m384 a, m384 b) { rv.hi = and128(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define or384(a, b) ({ \ - m384 rv_or384; \ - rv_or384.lo = or128((a).lo, (b).lo); \ - rv_or384.mid = or128((a).mid, (b).mid); \ - rv_or384.hi = or128((a).hi, (b).hi); \ - rv_or384; \ -}) -#else static really_inline m384 or384(m384 a, m384 b) { m384 rv; rv.lo = or128(a.lo, b.lo); @@ -776,17 +657,7 @@ static really_inline m384 or384(m384 a, m384 b) { rv.hi = or128(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor384(a, b) ({ \ - m384 rv_xor384; \ - rv_xor384.lo = xor128((a).lo, (b).lo); \ - rv_xor384.mid = xor128((a).mid, (b).mid); \ - rv_xor384.hi = xor128((a).hi, (b).hi); \ - rv_xor384; \ -}) -#else static really_inline m384 xor384(m384 a, m384 b) { m384 rv; rv.lo = xor128(a.lo, b.lo); @@ -794,17 +665,6 @@ static really_inline m384 xor384(m384 a, m384 b) { rv.hi = xor128(a.hi, b.hi); return rv; } -#endif - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define not384(a) ({ \ - m384 rv_not384; \ - rv_not384.lo = not128((a).lo); \ - rv_not384.mid = not128((a).mid); \ - rv_not384.hi = not128((a).hi); \ - rv_not384; \ -}) -#else static really_inline m384 not384(m384 a) { m384 rv; rv.lo = not128(a.lo); @@ -812,17 +672,6 @@ static really_inline m384 not384(m384 a) { rv.hi = not128(a.hi); return rv; } -#endif - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot384(a, b) ({ \ - m384 rv_andnot384; \ - 
rv_andnot384.lo = andnot128((a).lo, (b).lo); \ - rv_andnot384.mid = andnot128((a).mid, (b).mid); \ - rv_andnot384.hi = andnot128((a).hi, (b).hi); \ - rv_andnot384; \ -}) -#else static really_inline m384 andnot384(m384 a, m384 b) { m384 rv; rv.lo = andnot128(a.lo, b.lo); @@ -830,27 +679,16 @@ static really_inline m384 andnot384(m384 a, m384 b) { rv.hi = andnot128(a.hi, b.hi); return rv; } -#endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). -#if defined(__GNUC__) -#define shift384(a, b) ({ \ - m384 rv; \ - rv.lo = shift128(a.lo, b); \ - rv.mid = shift128(a.mid, b); \ - rv.hi = shift128(a.hi, b); \ - rv; \ -}) -#else -static really_inline m384 shift384(m384 a, unsigned b) { +// The shift amount is an immediate +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { m384 rv; - rv.lo = shift128(a.lo, b); - rv.mid = shift128(a.mid, b); - rv.hi = shift128(a.hi, b); + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); return rv; } -#endif static really_inline m384 zeroes384(void) { m384 rv = {zeroes128(), zeroes128(), zeroes128()}; @@ -980,103 +818,49 @@ char testbit384(const m384 *ptr, unsigned int n) { **** 512-bit Primitives ****/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define and512(a, b) ({ \ - m512 rv_and512; \ - rv_and512.lo = and256((a).lo, (b).lo); \ - rv_and512.hi = and256((a).hi, (b).hi); \ - rv_and512; \ -}) -#else static really_inline m512 and512(m512 a, m512 b) { m512 rv; rv.lo = and256(a.lo, b.lo); rv.hi = and256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define or512(a, b) ({ \ - m512 rv_or512; \ - rv_or512.lo = or256((a).lo, (b).lo); \ - rv_or512.hi = or256((a).hi, (b).hi); \ - rv_or512; \ -}) -#else static really_inline m512 or512(m512 a, m512 b) { m512 rv; rv.lo = or256(a.lo, b.lo); rv.hi = or256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor512(a, b) ({ \ - m512 rv_xor512; \ - rv_xor512.lo = xor256((a).lo, (b).lo); \ - rv_xor512.hi = xor256((a).hi, (b).hi); \ - rv_xor512; \ -}) -#else static really_inline m512 xor512(m512 a, m512 b) { m512 rv; rv.lo = xor256(a.lo, b.lo); rv.hi = xor256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define not512(a) ({ \ - m512 rv_not512; \ - rv_not512.lo = not256((a).lo); \ - rv_not512.hi = not256((a).hi); \ - rv_not512; \ -}) -#else static really_inline m512 not512(m512 a) { m512 rv; rv.lo = not256(a.lo); rv.hi = not256(a.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot512(a, b) ({ \ - m512 rv_andnot512; \ - rv_andnot512.lo = andnot256((a).lo, (b).lo); \ - rv_andnot512.hi = andnot256((a).hi, (b).hi); \ - rv_andnot512; \ -}) -#else static really_inline m512 andnot512(m512 a, m512 b) { m512 rv; rv.lo = andnot256(a.lo, b.lo); rv.hi = andnot256(a.hi, b.hi); return rv; } -#endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). 
-#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define shift512(a, b) ({ \ - m512 rv_shift512; \ - rv_shift512.lo = shift256(a.lo, b); \ - rv_shift512.hi = shift256(a.hi, b); \ - rv_shift512; \ -}) -#else -static really_inline m512 shift512(m512 a, unsigned b) { +// The shift amount is an immediate +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { m512 rv; - rv.lo = shift256(a.lo, b); - rv.hi = shift256(a.hi, b); + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); return rv; } -#endif static really_inline m512 zeroes512(void) { m512 rv = {zeroes256(), zeroes256()}; @@ -1132,19 +916,19 @@ static really_inline u32 diffrich64_512(m512 a, m512 b) { // aligned load static really_inline m512 load512(const void *ptr) { - assert(ISALIGNED_16(ptr)); + assert(ISALIGNED_N(ptr, alignof(m256))); m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; return rv; } // aligned store static really_inline void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); #if defined(__AVX2__) m512 *x = (m512 *)ptr; store256(&x->lo, a.lo); store256(&x->hi, a.hi); #else - assert(ISALIGNED_16(ptr)); ptr = assume_aligned(ptr, 16); *(m512 *)ptr = a; #endif diff --git a/src/util/simd_utils_ssse3.h b/src/util/simd_utils_ssse3.h deleted file mode 100644 index 6854ade3..00000000 --- a/src/util/simd_utils_ssse3.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief SIMD primitives specifically for Intel SSSE3 platforms. 
- */ - -#ifndef SIMD_UTILS_SSSE3_H_E27DF795C9AA02 -#define SIMD_UTILS_SSSE3_H_E27DF795C9AA02 - -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - -#include "simd_utils.h" -#include "ue2common.h" - -// we may already have x86intrin.h -#if !defined(USE_X86INTRIN_H) -#if defined(HAVE_C_INTRIN_H) -#include -#elif defined(HAVE_TMMINTRIN_H) -#include // SSSE3 intrinsics -#else -#define I_HAVE_BROKEN_INTRINSICS -#endif -#endif - - -#if !defined(I_HAVE_BROKEN_INTRINSICS) -// newish compilers get this right -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -#else -// must be inline, even in weak-sauce debug builds. -// oldish compilers either don't have the intrinsic, or force one arg through memory -static really_really_inline -m128 palignr(m128 r, m128 l, const int offset) { - __asm__ ("palignr %2,%1,%0" : "+x"(r) : "x"(l), "i"(offset)); - return r; -} -#endif - - -static really_inline -m128 pshufb(m128 a, m128 b) { - m128 result; -#if !defined(I_HAVE_BROKEN_INTRINSICS) - result = _mm_shuffle_epi8(a, b); -#else - __asm__("pshufb\t%1,%0" : "=x"(result) : "xm"(b), "0"(a)); -#endif - return result; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const char vbs_mask_data[]; -#ifdef __cplusplus -} -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb(in, shift_mask); -} - -#if defined(__AVX2__) - -static really_inline -m256 vpshufb(m256 a, m256 b) { - return _mm256_shuffle_epi8(a, b); -} - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define vpalignr(r, l, offset) ({ \ - m256 res = _mm256_alignr_epi8(r, l, offset); \ - res; \ -}) -#else -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -#endif - -#else // not __AVX2__ - -static really_inline -m256 vpshufb(m256 a, m256 b) { - m256 rv; - rv.lo = pshufb(a.lo, b.lo); - rv.hi = pshufb(a.hi, b.hi); - return rv; -} - -/* palignr requires the offset to be an immediate, which we can do with a - * compound macro, otherwise we have to enumerate the offsets and hope the - * compiler can throw the rest away. 
*/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define vpalignr(r, l, offset) ({ \ - m256 res; \ - res.lo = palignr(r.lo, l.lo, offset); \ - res.hi = palignr(r.hi, l.hi, offset); \ - res; \ -}) -#else -#define VPALIGN_CASE(N) case N: \ - res.lo = palignr(r.lo, l.lo, N); \ - res.hi = palignr(r.hi, l.hi, N); \ - return res; -static really_inline -m256 vpalignr(m256 r, m256 l, const int offset) { - m256 res; - switch (offset) { - VPALIGN_CASE(0) - VPALIGN_CASE(1) - VPALIGN_CASE(2) - VPALIGN_CASE(3) - VPALIGN_CASE(4) - VPALIGN_CASE(5) - VPALIGN_CASE(6) - VPALIGN_CASE(7) - VPALIGN_CASE(8) - VPALIGN_CASE(9) - VPALIGN_CASE(10) - VPALIGN_CASE(11) - VPALIGN_CASE(12) - VPALIGN_CASE(13) - VPALIGN_CASE(14) - VPALIGN_CASE(15) - default: - assert(0); - return zeroes256(); - } -} -#undef VPALIGN_CASE -#endif -#endif // __AVX2__ - -#endif /* SIMD_UTILS_SSSE3_H_E27DF795C9AA02 */ - diff --git a/src/util/ue2_containers.h b/src/util/ue2_containers.h index e3b01363..217d08ea 100644 --- a/src/util/ue2_containers.h +++ b/src/util/ue2_containers.h @@ -82,7 +82,7 @@ private: void increment() { ++it; } void decrement() { --it; } void advance(size_t n) { it += n; } - typename WrappedIter::difference_type + typename std::iterator_traits::difference_type distance_to(const iter_wrapper &other) const { return other.it - it; } diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 88695ea9..3c7be473 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,8 +55,6 @@ size_t maxStringSelfOverlap(const std::string &a, bool nocase); /// Compares two strings, returns non-zero if they're different. u32 cmp(const char *a, const char *b, size_t len, bool nocase); -class CharReach; - struct ue2_literal { public: /// Single element proxy, pointed to by our const_iterator. 
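The ue2_containers.h change above replaces the iterator wrapper's nested `difference_type` lookup with `std::iterator_traits`. As a brief aside (not part of this diff, just standard-library behaviour): raw pointers are valid random-access iterators but carry no nested typedefs, so only the `iterator_traits` form resolves for them, while class-type iterators work either way. A minimal illustration:

```cpp
#include <cstddef>
#include <iterator>
#include <type_traits>
#include <vector>

// iterator_traits yields a difference_type for both raw pointers and
// class-type iterators; a nested ::difference_type exists only for the latter.
static_assert(
    std::is_same<std::iterator_traits<int *>::difference_type,
                 std::ptrdiff_t>::value,
    "raw pointers get their difference_type via iterator_traits");
static_assert(
    std::is_same<
        std::iterator_traits<std::vector<int>::iterator>::difference_type,
        std::ptrdiff_t>::value,
    "class-type iterators resolve the same way");
```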
@@ -124,6 +122,13 @@ public: ue2_literal &operator=(const ue2_literal &) = default; ue2_literal &operator=(ue2_literal &&) = default; + template + ue2_literal(InputIt b, InputIt e) { + for (; b != e; ++b) { + push_back(*b); + } + } + size_type length() const { return s.length(); } bool empty() const { return s.empty(); } ue2_literal substr(size_type pos, size_type n = std::string::npos) const; diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h index 45ea4108..0619c7e4 100644 --- a/src/util/uniform_ops.h +++ b/src/util/uniform_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -125,12 +125,12 @@ #define andnot_m384(a, b) (andnot384(a, b)) #define andnot_m512(a, b) (andnot512(a, b)) -#define shift_u32(a, b) ((a) << (b)) -#define shift_u64a(a, b) ((a) << (b)) -#define shift_m128(a, b) (shift128(a, b)) -#define shift_m256(a, b) (shift256(a, b)) -#define shift_m384(a, b) (shift384(a, b)) -#define shift_m512(a, b) (shift512(a, b)) +#define lshift_u32(a, b) ((a) << (b)) +#define lshift_u64a(a, b) ((a) << (b)) +#define lshift_m128(a, b) (lshift64_m128(a, b)) +#define lshift_m256(a, b) (lshift64_m256(a, b)) +#define lshift_m384(a, b) (lshift64_m384(a, b)) +#define lshift_m512(a, b) (lshift64_m512(a, b)) #define isZero_u8(a) ((a) == 0) #define isZero_u32(a) ((a) == 0) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index a893d3d5..63f3a9ac 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -25,6 +25,11 @@ if(CXX_WUNUSED_VARIABLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") endif() +if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? 
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-array-bounds") +endif() + add_library(gtest STATIC ${gtest_SOURCES}) add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) @@ -65,6 +70,7 @@ set(unit_internal_SOURCES internal/pqueue.cpp internal/repeat.cpp internal/rose_build_merge.cpp + internal/rose_mask.cpp internal/rvermicelli.cpp internal/simd_utils.cpp internal/shuffle.cpp diff --git a/unit/hyperscan/allocators.cpp b/unit/hyperscan/allocators.cpp index 66c456ee..40c45072 100644 --- a/unit/hyperscan/allocators.cpp +++ b/unit/hyperscan/allocators.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,9 @@ #include "test_util.h" #include +#include + +using std::string; static void *null_malloc(size_t) { return nullptr; } @@ -83,6 +86,22 @@ TEST(CustomAllocator, TwoAlignedCompile) { hs_set_database_allocator(nullptr, nullptr); } +TEST(CustomAllocator, TwoAlignedCompileError) { + hs_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + hs_error_t err = + hs_compile("\\1", 0, HS_MODE_BLOCK, platform, &db, &compile_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); + hs_free_compile_error(compile_err); + hs_set_database_allocator(nullptr, nullptr); +} + TEST(CustomAllocator, TwoAlignedDatabaseInfo) { hs_database_t *db = buildDB("foobar", 0, 0, HS_MODE_BLOCK); ASSERT_TRUE(db != nullptr); @@ -149,3 +168,30 @@ TEST(CustomAllocator, TwoAlignedAllocScratch) { hs_set_scratch_allocator(nullptr, nullptr); hs_free_database(db); } + +TEST(CustomAllocator, NullMallocExpressionInfo) { + hs_set_allocator(null_malloc, nullptr); + + string pattern = "foobar"; + hs_expr_info_t *info = nullptr; + hs_compile_error_t *c_err = nullptr; + hs_error_t err = hs_expression_info(pattern.c_str(), 0, &info, &c_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_NE(nullptr, c_err); + hs_free_compile_error(c_err); + hs_set_allocator(nullptr, nullptr); +} + +TEST(CustomAllocator, TwoAlignedExpressionInfo) { + hs_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + string pattern = "\\1"; + hs_expr_info_t *info = nullptr; + hs_compile_error_t *c_err = nullptr; + hs_error_t err = hs_expression_info(pattern.c_str(), 0, &info, &c_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_NE(nullptr, c_err); + EXPECT_STREQ("Allocator returned misaligned memory.", c_err->message); + hs_free_compile_error(c_err); + hs_set_allocator(nullptr, nullptr); +} diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 9fc3a413..1a33210d 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -32,7 +32,6 @@ 31:/\B/W #\B unsupported in UCP mode at index 0. 32:/foo(?{print "Hello world\n";})bar/ #Embedded code is not supported at index 3. 33:/the (\S+)(?{ $color = $^N }) (\S+)(?{ $animal = $^N })/i #Embedded code is not supported at index 9. -34:/foobar\E/s #Unmatched \E at index 6. 35:/\X/8 #\X unsupported at index 0. 36:/\B+/ #Invalid repeat at index 2. 37:/\B?/ #Invalid repeat at index 2. 
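Removing the `/foobar\E/s` entry from bad_patterns.txt above implies that a stray `\E` (one with no preceding `\Q`) is no longer treated as a compile error. The sketch below is not part of this change; it only uses the public `hs_compile()` API (as the unit tests in this diff do) to check that behaviour from application code, assuming the pattern now compiles cleanly:

```cpp
#include <cstdio>
#include <hs.h>

int main() {
    hs_database_t *db = nullptr;
    hs_compile_error_t *compile_err = nullptr;
    // "/foobar\E/s" from bad_patterns.txt: the 's' flag maps to HS_FLAG_DOTALL,
    // and the stray \E is written as "\\E" in a C++ string literal.
    hs_error_t err = hs_compile("foobar\\E", HS_FLAG_DOTALL, HS_MODE_BLOCK,
                                nullptr, &db, &compile_err);
    if (err != HS_SUCCESS) {
        // This path was previously taken with "Unmatched \E at index 6."
        std::printf("compile failed: %s\n", compile_err->message);
        hs_free_compile_error(compile_err);
        return 1;
    }
    hs_free_database(db);
    return 0;
}
```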
diff --git a/unit/hyperscan/serialize.cpp b/unit/hyperscan/serialize.cpp index e13d27b2..7e0fcb7c 100644 --- a/unit/hyperscan/serialize.cpp +++ b/unit/hyperscan/serialize.cpp @@ -483,4 +483,71 @@ TEST(Serialize, DeserializeUnalignedMalloc) { free(bytes); } +TEST(Serialize, DeserializeGarbage) { + hs_database_t *db; + hs_compile_error_t *c_err; + static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + + hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + // determine database size for subsequent hs_deserialize_database_at + size_t db_len; + err = hs_database_size(db, &db_len); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_NE(0, db_len); + + // serialize + char *bytes = nullptr; + size_t bytes_len = 0; + + err = hs_serialize_database(db, &bytes, &bytes_len); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_NE(0, bytes_len); + + hs_free_database(db); + + // append '\0' byte to the serialized string to spoil it + bytes = (char *)realloc(bytes, bytes_len + 1); + ASSERT_NE(nullptr, bytes); + bytes[bytes_len] = '\0'; + + // create set of invalid serializations + struct Arg { + char *start; + size_t len; + }; + + const Arg invalid_args[] = { + {bytes + 1, bytes_len}, + {bytes + 1, bytes_len - 1}, + {bytes, bytes_len - 1}, + {bytes, bytes_len + 1}, + }; + + for (const Arg &arg : invalid_args) { + hs_database_t *a_db; + err = hs_deserialize_database(arg.start, arg.len, &a_db); + ASSERT_NE(HS_SUCCESS, err); + + char *new_db = (char *)malloc(db_len); + ASSERT_NE(nullptr, new_db); + err = hs_deserialize_database_at(arg.start, arg.len, + (hs_database_t *)(new_db)); + ASSERT_NE(HS_SUCCESS, err); + free(new_db); + + char *info; + err = hs_serialized_database_info(arg.start, arg.len, &info); + ASSERT_NE(HS_SUCCESS, err); + + size_t ser_len; + err = hs_serialized_database_size(arg.start, arg.len, &ser_len); + ASSERT_NE(HS_SUCCESS, err); + } + + free(bytes); +} + } diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index e13270dc..4d476932 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -412,3 +412,27 @@ TEST(BitUtils, bf_it_1) { ASSERT_EQ(~0U, bf64_iterate(1ULL << 63, 63)); } +TEST(BitUtils, rank_in_mask32) { + for (u32 i = 0; i < 32; i++) { + ASSERT_EQ(i, rank_in_mask32(0xffffffff, i)); + ASSERT_EQ(0, rank_in_mask32(1U << i, i)); + } + ASSERT_EQ(0, rank_in_mask32(0xf0f0f0f0, 4)); + ASSERT_EQ(1, rank_in_mask32(0xf0f0f0f0, 5)); + ASSERT_EQ(3, rank_in_mask32(0xf0f0f0f0, 7)); + ASSERT_EQ(7, rank_in_mask32(0xf0f0f0f0, 15)); + ASSERT_EQ(15, rank_in_mask32(0xf0f0f0f0, 31)); +} + +TEST(BitUtils, rank_in_mask64) { + for (u32 i = 0; i < 64; i++) { + ASSERT_EQ(i, rank_in_mask64(0xffffffffffffffffULL, i)); + ASSERT_EQ(0, rank_in_mask64(1ULL << i, i)); + } + ASSERT_EQ(0, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 4)); + ASSERT_EQ(1, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 5)); + ASSERT_EQ(3, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 7)); + ASSERT_EQ(7, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 15)); + ASSERT_EQ(15, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 31)); + ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63)); +} diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp index 2bb359df..e40bda02 100644 --- a/unit/internal/lbr.cpp +++ b/unit/internal/lbr.cpp @@ -36,6 +36,7 @@ #include "nfa/nfa_internal.h" #include "nfa/nfa_api_util.h" #include "nfagraph/ng_lbr.h" +#include "nfagraph/ng_util.h" #include "util/alloc.h" #include "util/compile_context.h" #include "grey.h" @@ -71,7 +72,7 @@ struct 
LbrTestParams { }; static -int onMatch(u64a, ReportID, void *ctx) { +int onMatch(u64a, u64a, ReportID, void *ctx) { unsigned *matches = (unsigned *)ctx; (*matches)++; return MO_CONTINUE_MATCHING; @@ -97,6 +98,7 @@ protected: ParsedExpression parsed(0, pattern.c_str(), flags, 0); unique_ptr g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); ASSERT_TRUE(isLBR(*g, grey)); @@ -122,9 +124,9 @@ protected: q.length = 0; // filled in by test q.history = nullptr; q.hlength = 0; + q.scratch = nullptr; // not needed by LBR q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index 91ab09db..6bb4fcb9 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -31,14 +31,15 @@ #include "grey.h" #include "compiler/compiler.h" -#include "nfagraph/ng.h" -#include "nfagraph/ng_limex.h" -#include "nfagraph/ng_restructuring.h" #include "nfa/limex_context.h" #include "nfa/limex_internal.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_util.h" #include "nfa/nfa_internal.h" +#include "nfagraph/ng.h" +#include "nfagraph/ng_limex.h" +#include "nfagraph/ng_restructuring.h" +#include "nfagraph/ng_util.h" #include "util/alloc.h" #include "util/target_info.h" @@ -51,7 +52,7 @@ static const string SCAN_DATA = "___foo______\n___foofoo_foo_^^^^^^^^^^^^^^^^^^" static const u32 MATCH_REPORT = 1024; static -int onMatch(u64a, ReportID, void *ctx) { +int onMatch(u64a, u64a, ReportID, void *ctx) { unsigned *matches = (unsigned *)ctx; (*matches)++; return MO_CONTINUE_MATCHING; @@ -76,6 +77,7 @@ protected: ParsedExpression parsed(0, expr.c_str(), flags, 0); unique_ptr g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); rm.setProgramOffset(0, MATCH_REPORT); @@ -102,9 +104,9 @@ protected: q.length = SCAN_DATA.size(); q.history = nullptr; q.hlength = 0; + q.scratch = nullptr; /* limex does not use scratch */ q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } @@ -129,7 +131,7 @@ protected: INSTANTIATE_TEST_CASE_P( LimEx, LimExModelTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExModelTest, StateSize) { ASSERT_TRUE(nfa != nullptr); @@ -292,8 +294,7 @@ TEST_P(LimExModelTest, CheckFinalState) { // Check for EOD matches. char rv = nfaCheckFinalState(nfa.get(), full_state.get(), - stream_state.get(), end, onMatch, nullptr, - &matches); + stream_state.get(), end, onMatch, &matches); ASSERT_EQ(MO_CONTINUE_MATCHING, rv); } @@ -311,14 +312,14 @@ protected: ParsedExpression parsed(0, expr.c_str(), flags, 0); unique_ptr g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); // Reverse the graph and add some reports on the accept vertices. 
NGHolder g_rev(NFA_REV_PREFIX); reverseHolder(*g, g_rev); - NFAGraph::inv_adjacency_iterator ai, ae; - for (tie(ai, ae) = inv_adjacent_vertices(g_rev.accept, g_rev); ai != ae; - ++ai) { - g_rev[*ai].reports.insert(0); + clearReports(g_rev); + for (NFAVertex v : inv_adjacent_vertices_range(g_rev.accept, g_rev)) { + g_rev[v].reports.insert(0); } nfa = constructReversedNFA(g_rev, type, cc); @@ -336,7 +337,7 @@ protected: }; INSTANTIATE_TEST_CASE_P(LimExReverse, LimExReverseTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExReverseTest, BlockExecReverse) { ASSERT_TRUE(nfa != nullptr); @@ -370,6 +371,7 @@ protected: ReportManager rm(cc.grey); unique_ptr g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); rm.setProgramOffset(0, MATCH_REPORT); @@ -396,9 +398,9 @@ protected: q.length = ZOMBIE_SCAN_DATA.length(); q.history = nullptr; q.hlength = 0; + q.scratch = nullptr; /* limex does not use scratch */ q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } @@ -422,7 +424,7 @@ protected: }; INSTANTIATE_TEST_CASE_P(LimExZombie, LimExZombieTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExZombieTest, GetZombieStatus) { ASSERT_TRUE(nfa != nullptr); diff --git a/unit/internal/multiaccel_matcher.cpp b/unit/internal/multiaccel_matcher.cpp index 45a24f46..bdf56ff9 100644 --- a/unit/internal/multiaccel_matcher.cpp +++ b/unit/internal/multiaccel_matcher.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,47 +43,16 @@ extern "C" { #include "util/alloc.h" #include "util/charreach.h" +#include +#include +#include #include #include -#include -#include -#include using namespace ue2; using namespace std; using namespace testing; -/* - * Static functions needed for this test's wellbeing - */ - -// char generator -static inline -char getChar(const CharReach &cr, bool match) { - char result; - do { - result = rand() % CharReach::npos; - } while (cr.test(result) != match); - return result; -} - -// appends a string with matches/unmatches according to input match pattern -static -void getMatch(u8 *result, u32 start, const string &pattern, - const CharReach &cr) { - for (const auto &c : pattern) { - result[start++] = getChar(cr, c == '1'); - } -} - -// appends non-matching noise of certain lengths -static -void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) { - for (unsigned i = 0; i < len; i++) { - result[start + i] = getChar(cr, false); - } -} - // test parameters structure struct MultiaccelTestParam { string match_pattern; @@ -126,6 +95,34 @@ protected: test_all_offsets = p.test_all_offsets; } + char getChar(const CharReach &cr) { + assert(cr.count() > 0); + auto dist = uniform_int_distribution(0, cr.count() - 1); + size_t result = cr.find_nth(dist(prng)); + assert(result != CharReach::npos); + return (char)result; + } + + // char generator + char getChar(const CharReach &cr, bool match) { + return getChar(match ? 
cr : ~cr); + } + + // appends a string with matches/unmatches according to input match pattern + void getMatch(u8 *result, u32 start, const string &pattern, + const CharReach &cr) { + for (const auto &c : pattern) { + result[start++] = getChar(cr, c == '1'); + } + } + + // appends non-matching noise of certain lengths + void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) { + for (unsigned i = 0; i < len; i++) { + result[start + i] = getChar(cr, false); + } + } + // deferred buffer generation, as we don't know CharReach before we run the test void GenerateBuffer(const CharReach &cr) { const MultiaccelTestParam &p = GetParam(); @@ -167,6 +164,10 @@ protected: aligned_free(buffer); } + // We want our tests to be deterministic, so we use a PRNG in the test + // fixture. + mt19937 prng; + u32 match_idx; u8 *buffer; bool test_all_offsets; diff --git a/unit/internal/nfagraph_equivalence.cpp b/unit/internal/nfagraph_equivalence.cpp index 3677e1d2..3ca1923f 100644 --- a/unit/internal/nfagraph_equivalence.cpp +++ b/unit/internal/nfagraph_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,8 @@ */ /** - * Unit tests for checking the removeGraphEquivalences code in nfagraph/ng_equivalence.cpp. + * Unit tests for checking the removeGraphEquivalences code in + * nfagraph/ng_equivalence.cpp. */ #include "config.h" @@ -71,10 +72,9 @@ TEST(NFAGraph, RemoveEquivalence1) { ASSERT_EQ(2U, in_degree(g.accept, g)); // Find a vertex that goes right after startDs - NFAVertex a = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : adjacent_vertices_range(g.startDs, g)) { + a = v; if (a == g.startDs) { continue; } @@ -87,8 +87,8 @@ TEST(NFAGraph, RemoveEquivalence1) { ASSERT_TRUE(a != nullptr); // There should be two edges from v to nodes with reachability 'b' and 'c' - NFAVertex b = NFAGraph::null_vertex(); - NFAVertex c = NFAGraph::null_vertex(); + NFAVertex b = NGHolder::null_vertex(); + NFAVertex c = NGHolder::null_vertex(); for (NFAVertex tmp : adjacent_vertices_range(a, g)) { const CharReach &tmpcr = g[tmp].char_reach; ASSERT_EQ(1U, tmpcr.count()); @@ -133,11 +133,9 @@ TEST(NFAGraph, RemoveEquivalence2) { ASSERT_EQ(1U, in_degree(g.accept, g)); // Find a vertex leading to accept - NFAVertex a = NFAGraph::null_vertex(); - NFAGraph::inv_adjacency_iterator ai, ae; - for (tie(ai, ae) = inv_adjacent_vertices(g.accept, g); ai != ae; - ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) { + a = v; if (a == g.accept) { continue; } @@ -150,8 +148,8 @@ TEST(NFAGraph, RemoveEquivalence2) { ASSERT_TRUE(a != nullptr); // There should be two edges from v to nodes with reachability 'b' and 'c' - NFAVertex b = NFAGraph::null_vertex(); - NFAVertex c = NFAGraph::null_vertex(); + NFAVertex b = NGHolder::null_vertex(); + NFAVertex c = NGHolder::null_vertex(); for (NFAVertex tmp : inv_adjacent_vertices_range(a, g)) { const CharReach &tmpcr = g[tmp].char_reach; ASSERT_EQ(1U, tmpcr.count()); @@ -197,10 +195,9 @@ TEST(NFAGraph, RemoveEquivalence3) { ASSERT_EQ(2U, in_degree(g.accept, g)); // Find a vertex 'a' that goes right after startDs - NFAVertex a = 
NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : adjacent_vertices_range(g.startDs, g)) { + a = v; if (a == g.startDs) { continue; } @@ -234,10 +231,9 @@ TEST(NFAGraph, RemoveEquivalence3) { ASSERT_TRUE(edge(dot2, dot1, g).second); // now, let's find X and Y nodes - NFAVertex X = NFAGraph::null_vertex(); - NFAVertex Y = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(dot2, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; + NFAVertex X = NGHolder::null_vertex(); + NFAVertex Y = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(dot2, g)) { // we already know about dot1, so skip it if (tmp == dot1) { @@ -290,12 +286,9 @@ TEST(NFAGraph, RemoveEquivalence4) { ASSERT_EQ(1U, in_degree(g.accept, g)); // Find X and Y nodes that are connected to startDs - NFAVertex X = NFAGraph::null_vertex(); - NFAVertex Y = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex X = NGHolder::null_vertex(); + NFAVertex Y = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(g.startDs, g)) { // skip startDs if (tmp == g.startDs) { continue; @@ -341,10 +334,8 @@ TEST(NFAGraph, RemoveEquivalence4) { ASSERT_TRUE(edge(dot2, dot1, g).second); // now find 'a' - NFAVertex a = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(dot2, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(dot2, g)) { // skip dot1 if (tmp == dot1) { continue; @@ -392,10 +383,9 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find first vertex and ensure it has a self loop - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - v = *ai; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.startDs, g)) { + v = t; if (v == g.startDs) { continue; } @@ -409,15 +399,13 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_TRUE(v != nullptr); // now, find the vertex leading to accept - NFAVertex v2 = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex v2 = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(v, g)) { // skip self-loop if (tmp == v) { continue; } - v2 = *ai; + v2 = tmp; // get char reach const CharReach tmpcr = g[tmp].char_reach; @@ -450,10 +438,9 @@ TEST(NFAGraph, RemoveEquivalence6) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find that vertex and ensure it has no self loops and an edge to accept - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - v = *ai; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.startDs, g)) { + v = t; if (v == g.startDs) { continue; } @@ -492,13 +479,12 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find that vertex and ensure it's a dot self loop and has one outgoing edge - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.start, g); ai != ae; ++ai) { - if (*ai == g.startDs) { + NFAVertex v = 
NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.start, g)) { + if (t == g.startDs) { continue; } - v = *ai; + v = t; // check if it has the right char reach const CharReach &tmpcr = g[v].char_reach; ASSERT_TRUE(tmpcr.all()); @@ -509,13 +495,13 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_TRUE(v != nullptr); // find the next vertex and ensure it has an edge to accept - NFAVertex v2 = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { + NFAVertex v2 = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(v, g)) { // skip self loop - if (*ai == v) { + if (t == v) { continue; } - v2 = *ai; + v2 = t; // check if it has the right char reach const CharReach &tmpcr = g[v2].char_reach; ASSERT_EQ(1U, tmpcr.count()); diff --git a/unit/internal/nfagraph_redundancy.cpp b/unit/internal/nfagraph_redundancy.cpp index 16266453..acb3cc7b 100644 --- a/unit/internal/nfagraph_redundancy.cpp +++ b/unit/internal/nfagraph_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,8 @@ */ /** - * Unit tests for checking the removeRedundancy code in nfagraph/ng_redundancy.cpp. + * Unit tests for checking the removeRedundancy code in + * nfagraph/ng_redundancy.cpp. */ #include "config.h" @@ -62,15 +63,17 @@ TEST(NFAGraph, RemoveRedundancy1) { // Our graph should only have two non-special nodes ASSERT_EQ((size_t)N_SPECIALS + 2, num_vertices(*graph)); - // Dot-star start state should be connected to itself and a single other vertex + // Dot-star start state should be connected to itself and a single other + // vertex ASSERT_EQ(2U, out_degree(graph->startDs, g)); // That single vertex should have reachability [ab] - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(graph->startDs, g); ai != ae; ++ai) { - v = *ai; - if (v != graph->startDs) break; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(graph->startDs, g)) { + v = t; + if (v != graph->startDs) { + break; + } } const CharReach &cr = g[v].char_reach; ASSERT_EQ(2U, cr.count()); @@ -103,35 +106,39 @@ TEST(NFAGraph, RemoveRedundancy2) { // Our graph should now have only 3 non-special vertices ASSERT_EQ((size_t)N_SPECIALS + 3, num_vertices(*graph)); - // Dot-star start state should be connected to itself and a single other vertex + // Dot-star start state should be connected to itself and a single other + // vertex ASSERT_EQ(2U, out_degree(graph->startDs, g)); // That single vertex should have reachability [a] - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(graph->startDs, g); ai != ae; ++ai) { - v = *ai; - if (v != graph->startDs) break; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(graph->startDs, g)) { + v = t; + if (v != graph->startDs) { + break; + } } const CharReach &cr = g[v].char_reach; ASSERT_EQ(1U, cr.count()); ASSERT_TRUE(cr.test('a')); - // 'a' should have two out edges: one to a dot with a cycle (.*) and one to 'c' + // 'a' should have two out edges: one to a dot with a cycle (.*) and one to + // 'c' ASSERT_EQ(2U, out_degree(v, g)); - NFAVertex dotstar = NFAGraph::null_vertex(), vc = NFAGraph::null_vertex(); - for (tie(ai, ae) = 
adjacent_vertices(v, g); ai != ae; ++ai) { - const CharReach &cr2 = g[*ai].char_reach; + NFAVertex dotstar = NGHolder::null_vertex(); + NFAVertex vc = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(v, g)) { + const CharReach &cr2 = g[t].char_reach; if (cr2.count() == 1 && cr2.test('c')) { - vc = *ai; + vc = t; } else if (cr2.all()) { - dotstar = *ai; + dotstar = t; } else { FAIL(); } } - ASSERT_TRUE(vc != NFAGraph::null_vertex()); - ASSERT_TRUE(dotstar != NFAGraph::null_vertex()); + ASSERT_TRUE(vc != NGHolder::null_vertex()); + ASSERT_TRUE(dotstar != NGHolder::null_vertex()); // Dot-star node should have a self-loop and an edge to vertex 'c' ASSERT_EQ(2U, out_degree(dotstar, g)); diff --git a/unit/internal/nfagraph_repeat.cpp b/unit/internal/nfagraph_repeat.cpp index 2473d755..b34d1271 100644 --- a/unit/internal/nfagraph_repeat.cpp +++ b/unit/internal/nfagraph_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,7 @@ #include "gtest/gtest.h" #include "nfagraph/ng_repeat.h" +#include "nfagraph/ng_util.h" #include "util/depth.h" #include "hs_compile.h" @@ -89,12 +90,15 @@ static const PureRepeatTest pureRepeatTests[] = { { "^..?..?..?..?..?", 5, 10 } }; -INSTANTIATE_TEST_CASE_P(PureRepeat, NFAPureRepeatTest, ValuesIn(pureRepeatTests)); +INSTANTIATE_TEST_CASE_P(PureRepeat, NFAPureRepeatTest, + ValuesIn(pureRepeatTests)); TEST_P(NFAPureRepeatTest, Check) { const PureRepeatTest &t = GetParam(); SCOPED_TRACE(testing::Message() << "Pattern: " << t.pattern); - unique_ptr w(constructGraph(t.pattern, HS_FLAG_ALLOWEMPTY)); + auto w = constructGraph(t.pattern, HS_FLAG_ALLOWEMPTY); + ASSERT_TRUE(w != nullptr); + clearReports(*w); PureRepeat repeat; bool result = isPureRepeat(*w, repeat); diff --git a/unit/internal/nfagraph_util.cpp b/unit/internal/nfagraph_util.cpp index 81dfd682..135276dd 100644 --- a/unit/internal/nfagraph_util.cpp +++ b/unit/internal/nfagraph_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -85,24 +85,23 @@ TEST(NFAGraph, split1) { splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(3U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'c')); } ASSERT_EQ(8U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'b' && cr <= 'i'); } @@ -137,24 +136,23 @@ TEST(NFAGraph, split2) { splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(3U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : 
vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'a' && cr <= 'c'); } ASSERT_EQ(3U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'b' && cr <= 'd'); } @@ -211,24 +209,23 @@ TEST(NFAGraph, split3) { splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(7U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'g')); } ASSERT_EQ(2U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'h' && cr <= 'i'); } @@ -289,13 +286,12 @@ TEST(NFAGraph, split4) { splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(7U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'g')); } @@ -304,12 +300,12 @@ TEST(NFAGraph, split4) { ASSERT_TRUE(edge(lhs_map[d], lhs_map[d], lhs).second); ASSERT_EQ(2U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'h' && cr <= 'i'); } diff --git a/unit/internal/rose_build_merge.cpp b/unit/internal/rose_build_merge.cpp index ad6b0176..3f5a8382 100644 --- a/unit/internal/rose_build_merge.cpp +++ b/unit/internal/rose_build_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,20 +39,12 @@ #include "util/compile_context.h" #include "util/graph_range.h" #include "util/make_unique.h" +#include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" using std::vector; using namespace ue2; -static -std::unique_ptr constructBuilder(const Grey &grey) { - CompileContext cc(true, false, get_current_target(), grey); - ReportManager rm(cc.grey); - SomSlotManager ssm(8); // som precision - BoundaryReports boundary; - return makeRoseBuilder(rm, ssm, cc, boundary); -} - static std::unique_ptr makeSuffixGraph(ReportID report) { auto h = ue2::make_unique(NFA_SUFFIX); @@ -100,7 +92,12 @@ size_t numUniqueSuffixGraphs(const RoseGraph &g) { TEST(RoseMerge, 
uncalcLeaves_nonleaf) { Grey grey; - auto build_base = constructBuilder(grey); + CompileContext cc(true, false, get_current_target(), grey); + ReportManager rm(cc.grey); + SomSlotManager ssm(8); // som precision + auto smwr = makeSmallWriteBuilder(1, rm, cc); + BoundaryReports boundary; + auto build_base = makeRoseBuilder(rm, ssm, *smwr, cc, boundary); ASSERT_NE(nullptr, build_base); RoseBuildImpl &build = static_cast(*build_base); diff --git a/unit/internal/rose_mask.cpp b/unit/internal/rose_mask.cpp new file mode 100644 index 00000000..e6be00f3 --- /dev/null +++ b/unit/internal/rose_mask.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "rose/validate_mask.h" +#include "gtest/gtest.h" + +#define ONES64 0xffffffffffffffffULL + +/* valid_data_mask is flexible, don't need to be fixed in Info */ +struct ValidateMaskTestInfo { + u64a data; + u64a and_mask; + u64a cmp_mask; + u64a neg_mask; +}; + +static const ValidateMaskTestInfo testBasic[] = { + /* data is randomly picked */ + {0x1234abcd4321dcbaULL, 0xff09bbdd7f7ffeffULL, + 0x1200abcd4561dcbbULL, 0xffff00ffULL}, + /* data = "VaLiDaTe" */ + {0x56614c6944615465ULL, 0xe0feffffdf7b5480ULL, + 0x40614c6946615400ULL, 0xff0000ff000000ULL}, + /* data = "\0\0\0MASK\0" */ + {0x4d41534b00ULL, 0xfffffefebfdf002cULL, + 0x5536344c0173002cULL, 0xffffff0000ff00ffULL}, + /* data = "FOo14foo" */ + {0x464f6f3134666f6fULL, 0xdfdffffef8c0f000ULL, + 0x46466f3030406000ULL, 0xff000000000000ULL}, + /* data = "FOo14foo" with different cmp_mask and neg_mask*/ + {0x464f6f3134666f6fULL, 0xdfdffffef8c0f000ULL, + 0x44464f3034606f60ULL, 0xffffff00ffffffffULL}, +}; + +/* + * generate 37 different valid_data_mask + * 8 from 0xff to 0xff00000000000000 + * 7 from 0xffff to 0xffff000000000000 + * ... 
+ * 0xffffffffffffffff and 0 + */ +static int initLegalValidMasks(u64a validMasks[]) { + u64a data = ONES64; + int num = 0; + for (int i = 0; i < 64; i += 8) { + for (int j = 0; j <= i; j += 8) { + validMasks[num] = data << j; + num++; + } + data >>= 8; + } + validMasks[num] = 0; + num++; + return num; +} + +/* + * generate all 256 neg_masks + * including 0, 0xff, 0xff00,..., 0xffffffffffffffff + */ +static int initLegalNegMasks(u64a negMasks[]) { + u64a data = 0; + u64a offset; + int num = 0; + while (data != ONES64) { + negMasks[num] = data; + num++; + offset = (data | (data +1)) ^ data; + data += 0xfeULL * offset + 1; + } + negMasks[num] = data; + num++; + return num; +} + + +/* + * check all legal valid_mask(37 different) for validateMask[] + */ +TEST(ValidateMask, ValidMaskTests) { + u64a validMasks[256]; + int num = initLegalValidMasks(validMasks); + + for (const auto &t : testBasic) { + for (int i = 0; i < num; i++) { + EXPECT_EQ(1, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + t.neg_mask)); + } + } +} + +/* + * fix neg_mask to 0 and ONES64, + * check output of ValidateMask on different valid_mask, + * for neg_mask = 0, + */ +TEST(ValidateMask, AdvancedValidMaskTests) { + u64a validMasks[256]; + int num = initLegalValidMasks(validMasks); + int bool_result; + for (const auto &t: testBasic) { + for (int i = 0; i < num; i++) { + bool_result = !(validMasks[i] & t.neg_mask); + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + 0)); + bool_result = (validMasks[i] | t.neg_mask) == t.neg_mask; + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + ONES64)); + } + } +} + +/* + * test every pair of valid_data_mask and neg_mask + * and compute the expect output by a formula + */ +TEST(ValidateMask, FullTests) { + u64a validMasks[256]; + u64a negMasks[256]; + int vm_num = initLegalValidMasks(validMasks); + int nm_num = initLegalNegMasks(negMasks); + int bool_result; + for (const auto &t: testBasic) { + for (int i = 0; i < vm_num; i++) { + for (int j = 0; j < nm_num; j++) { + /* + * treat t.neg_mask as a truthtable (a negative truthtable) + * we expect validateMask output 1 if and only if + * the truthtable(tt) and neg_mask(nm) looks same + * under "&" operation with valid_data_mask(vdm) + * that is + * output = (tt & vdm) == (nm & vdm) ? 
1 : 0; + */ + bool_result = (t.neg_mask & validMasks[i]) == + (negMasks[j] & validMasks[i]); + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + negMasks[j])); + } + } + } +} + +/* + * drop the original validateMask[].neg_mask + * and test more neg_mask and valid_mask manually + */ +TEST(ValidateMask, ManualTest_0) { + const auto &t = testBasic[0]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 16) >> 8, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xffffff00ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 24, + t.and_mask, t.cmp_mask, 0xff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffffffff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, 0, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(1, validateMask(t.data, 0, + t.and_mask, t.cmp_mask, ~t.neg_mask)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffffffffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffULL)); +} + +TEST(ValidateMask, ManualTest_1) { + const auto &t = testBasic[1]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 32, + t.and_mask, t.cmp_mask, 0xff000000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 32, + t.and_mask, t.cmp_mask, 0xff0000ffff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 8, + t.and_mask, t.cmp_mask, 0xffff0000ff000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xff000000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 16, + t.and_mask, t.cmp_mask, 0xff00ff00)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xff00000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 48, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffff0000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 8, + t.and_mask, t.cmp_mask, 0xff000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xffff000000ULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 40) >> 16, + t.and_mask, t.cmp_mask, 0xff000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, ONES64)); +} + +TEST(ValidateMask, ManualTest_2) { + const auto &t = testBasic[2]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 24, + t.and_mask, t.cmp_mask, 0xffffff0000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00ffffff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, 
0, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 24, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffff00ff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 56) >> 40, + t.and_mask, t.cmp_mask, 0xff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 56) >> 32, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xffffffff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 48, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffffff00000000ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffff00ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffffffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xffffff00000000ffULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xffffff000000ff00ULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 56) >> 40, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 56) >> 48, + t.and_mask, t.cmp_mask, 0xff00ULL)); +} diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index 58e5a61f..614b641d 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,8 +31,7 @@ #include "gtest/gtest.h" #include "util/simd_utils.h" -#include "util/shuffle.h" -#include "util/shuffle_ssse3.h" +#include "nfa/limex_shuffle.h" namespace { @@ -50,34 +49,34 @@ Mask setbit(unsigned int bit) { return cf.simd; } -TEST(Shuffle, ShuffleDynamic32_1) { +TEST(Shuffle, PackedExtract32_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 32; i++) { // shuffle a single 1 bit to the front u32 mask = 1U << i; - EXPECT_EQ(1U, shuffleDynamic32(mask, mask)); - EXPECT_EQ(1U, shuffleDynamic32(~0U, mask)); + EXPECT_EQ(1U, packedExtract32(mask, mask)); + EXPECT_EQ(1U, packedExtract32(~0U, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, shuffleDynamic32(0, mask)); - EXPECT_EQ(0U, shuffleDynamic32(~mask, mask)); + EXPECT_EQ(0U, packedExtract32(0, mask)); + EXPECT_EQ(0U, packedExtract32(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 32); j++) { - EXPECT_EQ(0U, shuffleDynamic32((1U << j), mask)); + EXPECT_EQ(0U, packedExtract32((1U << j), mask)); } } } -TEST(Shuffle, ShuffleDynamic32_2) { +TEST(Shuffle, PackedExtract32_2) { // All 32 bits in mask are on u32 mask = ~0U; - EXPECT_EQ(0U, 
shuffleDynamic32(0, mask)); - EXPECT_EQ(mask, shuffleDynamic32(mask, mask)); + EXPECT_EQ(0U, packedExtract32(0, mask)); + EXPECT_EQ(mask, packedExtract32(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, shuffleDynamic32(1U << i, mask)); + EXPECT_EQ(1U << i, packedExtract32(1U << i, mask)); } } -TEST(Shuffle, ShuffleDynamic32_3) { +TEST(Shuffle, PackedExtract32_3) { // Try setting every second bit u32 mask = 0; for (unsigned int i = 0; i < 32; i += 2) { @@ -85,63 +84,63 @@ TEST(Shuffle, ShuffleDynamic32_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ((1U << 16) - 1, shuffleDynamic32(mask, mask)); - EXPECT_EQ((1U << 16) - 1, shuffleDynamic32(~mask, ~mask)); - EXPECT_EQ(0U, shuffleDynamic32(~mask, mask)); - EXPECT_EQ(0U, shuffleDynamic32(mask, ~mask)); + EXPECT_EQ((1U << 16) - 1, packedExtract32(mask, mask)); + EXPECT_EQ((1U << 16) - 1, packedExtract32(~mask, ~mask)); + EXPECT_EQ(0U, packedExtract32(~mask, mask)); + EXPECT_EQ(0U, packedExtract32(mask, ~mask)); for (unsigned int i = 0; i < 32; i += 2) { - EXPECT_EQ(1U << (i/2), shuffleDynamic32(1U << i, mask)); - EXPECT_EQ(0U, shuffleDynamic32(1U << i, ~mask)); - EXPECT_EQ(1U << (i/2), shuffleDynamic32(1U << (i+1), ~mask)); - EXPECT_EQ(0U, shuffleDynamic32(1U << (i+1), mask)); + EXPECT_EQ(1U << (i/2), packedExtract32(1U << i, mask)); + EXPECT_EQ(0U, packedExtract32(1U << i, ~mask)); + EXPECT_EQ(1U << (i/2), packedExtract32(1U << (i+1), ~mask)); + EXPECT_EQ(0U, packedExtract32(1U << (i+1), mask)); } } -TEST(Shuffle, ShuffleDynamic64_1) { +TEST(Shuffle, PackedExtract64_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 64; i++) { // shuffle a single 1 bit to the front u64a mask = 1ULL << i; - EXPECT_EQ(1U, shuffleDynamic64(mask, mask)); - EXPECT_EQ(1U, shuffleDynamic64(~0ULL, mask)); + EXPECT_EQ(1U, packedExtract64(mask, mask)); + EXPECT_EQ(1U, packedExtract64(~0ULL, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0U, shuffleDynamic64(~mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0U, packedExtract64(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 64); j++) { - EXPECT_EQ(0U, shuffleDynamic64((1ULL << j), mask)); + EXPECT_EQ(0U, packedExtract64((1ULL << j), mask)); } } } -TEST(Shuffle, ShuffleDynamic64_2) { +TEST(Shuffle, PackedExtract64_2) { // Fill first half of mask u64a mask = 0x00000000ffffffffULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << i, packedExtract64(1ULL << i, mask)); } // Fill second half of mask mask = 0xffffffff00000000ULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 32; i < 64; i++) { - EXPECT_EQ(1U << (i - 32), shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 32), packedExtract64(1ULL << i, mask)); } // Try one in the middle mask = 0x0000ffffffff0000ULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 16; i 
< 48; i++) { - EXPECT_EQ(1U << (i - 16), shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 16), packedExtract64(1ULL << i, mask)); } } -TEST(Shuffle, ShuffleDynamic64_3) { +TEST(Shuffle, PackedExtract64_3) { // Try setting every second bit (note: 32 bits, the max we can shuffle) u64a mask = 0; for (unsigned int i = 0; i < 64; i += 2) { @@ -149,46 +148,69 @@ TEST(Shuffle, ShuffleDynamic64_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(~mask, ~mask)); - EXPECT_EQ(0U, shuffleDynamic64(~mask, mask)); - EXPECT_EQ(0U, shuffleDynamic64(mask, ~mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(~mask, ~mask)); + EXPECT_EQ(0U, packedExtract64(~mask, mask)); + EXPECT_EQ(0U, packedExtract64(mask, ~mask)); for (unsigned int i = 0; i < 64; i += 2) { - EXPECT_EQ(1U << (i/2), shuffleDynamic64(1ULL << i, mask)); - EXPECT_EQ(0U, shuffleDynamic64(1ULL << i, ~mask)); - EXPECT_EQ(1U << (i/2), shuffleDynamic64(1ULL << (i+1), ~mask)); - EXPECT_EQ(0U, shuffleDynamic64(1ULL << (i+1), mask)); + EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << i, mask)); + EXPECT_EQ(0U, packedExtract64(1ULL << i, ~mask)); + EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << (i+1), ~mask)); + EXPECT_EQ(0U, packedExtract64(1ULL << (i+1), mask)); } } +template static -void build_pshufb_masks_onebit(unsigned int bit, m128 *permute, m128 *compare) { +void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) { + static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256), + "should be valid type"); // permute mask has 0x80 in all bytes except the one we care about memset(permute, 0x80, sizeof(*permute)); memset(compare, 0, sizeof(*compare)); char *pmsk = (char *)permute; char *cmsk = (char *)compare; - pmsk[0] = bit/8; - cmsk[0] = ~(1 << (bit % 8)); + u8 off = (bit >= 128) ? 
0x10 : 0; + pmsk[off] = bit/8; + cmsk[off] = ~(1 << (bit % 8)); } -TEST(Shuffle, ShufflePshufb128_1) { +TEST(Shuffle, PackedExtract128_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 128; i++) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, shufflePshufb128(setbit(i), permute, compare)); - EXPECT_EQ(1U, shufflePshufb128(ones128(), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases - EXPECT_EQ(0U, shufflePshufb128(zeroes128(), permute, compare)); - EXPECT_EQ(0U, shufflePshufb128(not128(setbit(i)), permute, compare)); + EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); + EXPECT_EQ(0U, packedExtract128(not128(setbit(i)), permute, compare)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 128); j++) { - EXPECT_EQ(0U, shufflePshufb128(setbit(j), permute, compare)); + EXPECT_EQ(0U, packedExtract128(setbit(j), permute, compare)); } } } +#if defined(__AVX2__) +TEST(Shuffle, PackedExtract256_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 256; i++) { + // shuffle a single 1 bit to the front + m256 permute, compare; + build_pshufb_masks_onebit(i, &permute, &compare); + EXPECT_EQ(1U, packedExtract256(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract256(ones256(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract256(zeroes256(), permute, compare)); + EXPECT_EQ(0U, packedExtract256(not256(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 256); j++) { + EXPECT_EQ(0U, packedExtract256(setbit(j), permute, compare)); + } + } +} +#endif } // namespace diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp index b8d77d37..81495a9c 100644 --- a/unit/internal/shufti.cpp +++ b/unit/internal/shufti.cpp @@ -283,9 +283,9 @@ TEST(DoubleShufti, BuildMask1) { lits.insert(make_pair('a', 'B')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -326,9 +326,9 @@ TEST(DoubleShufti, BuildMask2) { lits.insert(make_pair('a','z')); lits.insert(make_pair('B','z')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -354,9 +354,9 @@ TEST(DoubleShufti, BuildMask4) { lits.insert(make_pair('A','z')); lits.insert(make_pair('b','z')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -383,9 +383,9 @@ TEST(DoubleShufti, BuildMask5) { CharReach bytes; bytes.set('X'); - bool rv = shuftiBuildDoubleMasks(bytes, lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(bytes, lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -421,9 +421,9 @@ TEST(DoubleShufti, BuildMask6) { lits.insert(make_pair('A','x')); lits.insert(make_pair('b','x')); - bool 
rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -485,9 +485,9 @@ TEST(DoubleShufti, ExecNoMatch1) { lits.insert(make_pair('a','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -506,8 +506,8 @@ TEST(DoubleShufti, ExecNoMatch1b) { lits.insert(make_pair('b','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -527,8 +527,8 @@ TEST(DoubleShufti, ExecNoMatch2) { lits.insert(make_pair('a','b')); lits.insert(make_pair('B','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -548,8 +548,8 @@ TEST(DoubleShufti, ExecNoMatch2b) { lits.insert(make_pair('b','a')); lits.insert(make_pair('b','B')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -568,8 +568,8 @@ TEST(DoubleShufti, ExecNoMatch3) { lits.insert(make_pair('V','e')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -588,8 +588,8 @@ TEST(DoubleShufti, ExecNoMatch3b) { lits.insert(make_pair('e','V')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -608,8 +608,8 @@ TEST(DoubleShufti, ExecMatch1) { lits.insert(make_pair('a','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -629,8 +629,8 @@ TEST(DoubleShufti, ExecMatch2) { lits.insert(make_pair('a','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -651,8 +651,8 @@ TEST(DoubleShufti, ExecMatch3) { lits.insert(make_pair('B','a')); lits.insert(make_pair('a','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, 
&hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbBaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -675,8 +675,8 @@ TEST(DoubleShufti, ExecMatch4) { lits.insert(make_pair('C','a')); lits.insert(make_pair('c','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -717,8 +717,8 @@ TEST(DoubleShufti, ExecMatch4b) { lits.insert(make_pair('a','C')); lits.insert(make_pair('a','c')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaAaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -756,8 +756,8 @@ TEST(DoubleShufti, ExecMatch5) { lits.insert(make_pair('a','A')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -780,8 +780,8 @@ TEST(DoubleShufti, ExecMatchMixed1) { // just one one-byte literal onebyte.set('a'); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -804,8 +804,8 @@ TEST(DoubleShufti, ExecMatchMixed2) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; char t2[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -838,8 +838,8 @@ TEST(DoubleShufti, ExecMatchMixed3) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); const int len = 420; char t1[len + 1]; diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index de0f1eea..3c07b2b0 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -32,7 +32,6 @@ #include "util/alloc.h" #include "util/make_unique.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" using namespace std; using namespace ue2; @@ -644,50 +643,50 @@ TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 0), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 1), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 2), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 3), + 
EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 4), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 5), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 6), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 7), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 8), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 9), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 10), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 0), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 1), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 2), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 3), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 4), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 5), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 6), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 7), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 8), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 9), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 10), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index 30629f71..9fa6743e 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -158,7 +158,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps, DEBUG_PRINTF("dequeuing path %s, back %u\n", pathToString(g, *p).c_str(), g[u].index); - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) { NFAVertex v = *ai; diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp index 4d188d78..60ff0a17 100644 --- a/util/ng_find_matches.cpp +++ b/util/ng_find_matches.cpp @@ -76,7 +76,7 @@ struct fmstate { fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in, const ReportManager 
&rm_in) : num_states(num_vertices(g)), states(num_states), next(num_states), - vertices(num_vertices(g), NFAGraph::null_vertex()), som(som_in), + vertices(num_vertices(g), NGHolder::null_vertex()), som(som_in), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states), accept_with_eod(num_states) { // init states