diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e28e3b1..97b311e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [4.6.0] 2017-09-22 +- New API feature: stream state compression. This allows the user to compress + and restore state for streams to reduce memory usage. +- Many improvements to literal matching performance, including more support + for Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512). +- Compile time improvements, mainly reducing compiler memory allocation. + Also results in reduced compile time for some pattern sets. +- Bugfix for issue #62: fix error building Hyperscan using older versions of + Boost. +- Small updates to fix warnings identified by Coverity. + ## [4.5.2] 2017-07-26 - Bugfix for issue #57: Treat characters between `\Q.\E` as codepoints in UTF8 mode. diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c2e298a..59a3292b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11) project (hyperscan C CXX) set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 5) -set (HS_PATCH_VERSION 2) +set (HS_MINOR_VERSION 6) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -38,6 +38,7 @@ endif() set(BINDIR "${PROJECT_BINARY_DIR}/bin") set(LIBDIR "${PROJECT_BINARY_DIR}/lib") + set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) # First for the generic no-config case @@ -57,6 +58,11 @@ if(CMAKE_GENERATOR STREQUAL Xcode) set(XCODE TRUE) endif() +# older versions of cmake don't know things support isystem +if (XCODE OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") + set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem") +endif () + set(CMAKE_INCLUDE_CURRENT_DIR 1) include_directories(${PROJECT_SOURCE_DIR}/src) include_directories(${PROJECT_BINARY_DIR}) @@ -148,8 +154,9 @@ if(MSVC OR MSVC_IDE) # todo: change these as required set(ARCH_C_FLAGS "/arch:AVX2") set(ARCH_CXX_FLAGS "/arch:AVX2") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /wd4244 /wd4267") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 -D_CRT_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") endif() string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") @@ -248,7 +255,13 @@ else() endif() if(CMAKE_COMPILER_IS_GNUCXX) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0 -Wno-unused-local-typedefs -Wno-maybe-uninitialized") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") + endif () + # don't complain about abi + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") endif() if (NOT(ARCH_IA32 AND RELEASE_BUILD)) @@ -256,11 +269,6 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() - if (RELEASE_BUILD) - # we don't need the noise of ABI warnings in a release build - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") - endif () if 
(CMAKE_C_COMPILER_ID MATCHES "Intel") set(SKYLAKE_FLAG "-xCORE-AVX512") @@ -396,18 +404,14 @@ if (CXX_MISSING_DECLARATIONS) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations") endif() +CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS) + # gcc5 complains about this CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) endif() -if (NOT XCODE) - include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) -else() - # cmake doesn't think Xcode supports isystem - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${Boost_INCLUDE_DIRS}") -endif() - +include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) if(CMAKE_SYSTEM_NAME MATCHES "Linux") set(LINUX TRUE) @@ -419,10 +423,10 @@ endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") if(NOT WIN32) if(CMAKE_C_COMPILER_ID MATCHES "Intel") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable=remark") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable=remark") endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable=remark") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable=remark") endif() endif() @@ -513,6 +517,9 @@ set (hs_exec_SRCS src/crc32.h src/report.h src/runtime.c + src/stream_compress.c + src/stream_compress.h + src/stream_compress_impl.h src/fdr/fdr.c src/fdr/fdr.h src/fdr/fdr_internal.h @@ -629,6 +636,7 @@ set (hs_exec_SRCS src/util/masked_move.h src/util/multibit.h src/util/multibit.c + src/util/multibit_compress.h src/util/multibit_internal.h src/util/pack_bits.h src/util/popcount.h @@ -651,7 +659,7 @@ set (hs_exec_avx2_SRCS ) -SET (hs_SRCS +SET (hs_compile_SRCS ${hs_HEADERS} src/crc32.h src/database.h @@ -659,7 +667,6 @@ SET (hs_SRCS src/grey.h src/hs.cpp src/hs_internal.h - src/hs_version.c src/hs_version.h src/scratch.h src/state.h @@ -735,6 +742,7 @@ SET (hs_SRCS src/nfa/nfa_build_util.h src/nfa/nfa_internal.h src/nfa/nfa_kind.h + src/nfa/rdfa.cpp src/nfa/rdfa.h src/nfa/rdfa_graph.cpp src/nfa/rdfa_graph.h @@ -960,6 +968,7 @@ SET (hs_SRCS src/rose/rose_build_merge.cpp src/rose/rose_build_merge.h src/rose/rose_build_misc.cpp + src/rose/rose_build_misc.h src/rose/rose_build_program.cpp src/rose/rose_build_program.h src/rose/rose_build_resources.h @@ -996,9 +1005,13 @@ SET (hs_SRCS src/util/dump_mask.h src/util/fatbit_build.cpp src/util/fatbit_build.h + src/util/flat_containers.h src/util/graph.h + src/util/graph_range.h + src/util/graph_small_color_map.h src/util/hash.h src/util/hash_dynamic_bitset.h + src/util/insertion_ordered.h src/util/math.h src/util/multibit_build.cpp src/util/multibit_build.h @@ -1016,7 +1029,6 @@ SET (hs_SRCS src/util/small_vector.h src/util/target_info.cpp src/util/target_info.h - src/util/ue2_containers.h src/util/ue2_graph.h src/util/ue2string.cpp src/util/ue2string.h @@ -1024,6 +1036,7 @@ SET (hs_SRCS src/util/unicode_def.h src/util/unicode_set.h src/util/uniform_ops.h + src/util/unordered.h src/util/verify_types.h ) @@ -1076,7 +1089,7 @@ 
set(hs_dump_SRCS ) if (DUMP_SUPPORT) - set(hs_SRCS ${hs_SRCS} ${hs_dump_SRCS}) + set(hs_compile_SRCS ${hs_compile_SRCS} ${hs_dump_SRCS}) endif() # we group things by sublibraries, specifying shared and static and then @@ -1099,12 +1112,20 @@ if (NOT FAT_RUNTIME) add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) - add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $) + add_library(hs_compile OBJECT ${hs_compile_SRCS}) + + add_library(hs STATIC + src/hs_version.c + src/hs_valid_platform.c + $ + $) endif (BUILD_STATIC_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) set_target_properties(hs_exec_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) + set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) endif() else (FAT_RUNTIME) @@ -1158,10 +1179,11 @@ else (FAT_RUNTIME) $ ${RUNTIME_LIBS}) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + add_library(hs_compile OBJECT ${hs_compile_SRCS}) # we want the static lib for testing add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c - ${hs_SRCS} + $ $ ${RUNTIME_LIBS}) @@ -1169,6 +1191,8 @@ else (FAT_RUNTIME) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) # build shared libs + add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) + set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_core2 PROPERTIES @@ -1249,10 +1273,10 @@ endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (NOT FAT_RUNTIME) add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c - ${hs_SRCS} $) + $ $) else() add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c - ${hs_SRCS} $ + $ $ ${RUNTIME_SHLIBS}) endif() diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 02b5c3f3..6b6d972a 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -64,7 +64,7 @@ libpcre are supported. The use of unsupported constructs will result in compilation errors. The version of PCRE used to validate Hyperscan's interpretation of this syntax -is 8.40. +is 8.41. ==================== Supported Constructs diff --git a/doc/dev-reference/runtime.rst b/doc/dev-reference/runtime.rst index 665395a8..dbfe7633 100644 --- a/doc/dev-reference/runtime.rst +++ b/doc/dev-reference/runtime.rst @@ -80,6 +80,42 @@ functions for the management of streams: another, resetting the destination stream first. This call avoids the allocation done by :c:func:`hs_copy_stream`. +================== +Stream Compression +================== + +A stream object is allocated as a fixed size region of memory which has been +sized to ensure that no memory allocations are required during scan +operations. When the system is under memory pressure, it may be useful to reduce +the memory consumed by streams that are not expected to be used soon. The +Hyperscan API provides calls for translating a stream to and from a compressed +representation for this purpose. The compressed representation differs from the +full stream object as it does not reserve space for components which are not +required given the current stream state. 
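+The following is a minimal sketch of the intended compress/free/expand cycle
+(the individual calls are described below). It assumes a database ``db``
+compiled in streaming mode, allocated ``scratch`` space, a match callback
+``on_match``, and the data buffers being scanned; error handling is
+abbreviated:
+
+.. code-block:: c
+
+    hs_stream_t *stream;
+    hs_open_stream(db, 0, &stream);
+    hs_scan_stream(stream, data, data_len, 0, scratch, on_match, NULL);
+
+    /* Determine the required buffer size by offering no space at all. */
+    size_t needed = 0;
+    if (hs_compress_stream(stream, NULL, 0, &needed) != HS_INSUFFICIENT_SPACE) {
+        /* handle error */
+    }
+
+    char *buf = malloc(needed);
+    size_t used = 0;
+    hs_compress_stream(stream, buf, needed, &used);
+
+    /* The original stream is unmodified; close it to release its memory. */
+    hs_close_stream(stream, scratch, on_match, NULL);
+
+    /* Later, rebuild a live stream from the compressed image and continue. */
+    hs_stream_t *restored;
+    hs_expand_stream(db, &restored, buf, used);
+    free(buf);
+    hs_scan_stream(restored, more_data, more_len, 0, scratch, on_match, NULL);
+    hs_close_stream(restored, scratch, on_match, NULL);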
The Hyperscan API functions for this +functionality are: + +* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed + representation of the stream and returns the number of bytes consumed by the + compressed representation. If the buffer is not large enough to hold the + compressed representation, :c:member:`HS_INSUFFICIENT_SPACE` is returned along + with the required size. This call does not modify the original stream in any + way: it may still be written to with :c:func:`hs_scan_stream`, used as part of + the various reset calls to reinitialise its state, or + :c:func:`hs_close_stream` may be called to free its resources. + +* :c:func:`hs_expand_stream`: creates a new stream based on a buffer containing + a compressed representation. + +* :c:func:`hs_reset_and_expand_stream`: constructs a stream based on a buffer + containing a compressed representation on top of an existing stream, resetting + the existing stream first. This call avoids the allocation done by + :c:func:`hs_expand_stream`. + +Note: it is not recommended to use stream compression between every call to scan +for performance reasons as it takes time to convert between the compressed +representation and a standard stream. + + ********** Block Mode ********** diff --git a/examples/patbench.cc b/examples/patbench.cc index f82f47a7..20de5745 100644 --- a/examples/patbench.cc +++ b/examples/patbench.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -165,6 +165,7 @@ static bool higher_is_better(Criterion c) { } static void print_criterion(Criterion c, double val) { + std::ios::fmtflags f(cout.flags()); switch (c) { case CRITERION_THROUGHPUT: cout << std::fixed << std::setprecision(3) << val << " Megabits/s"; @@ -179,6 +180,7 @@ static void print_criterion(Criterion c, double val) { cout << static_cast(val) << " bytes"; break; } + cout.flags(f); } // Key for identifying a stream in our pcap input data, using data from its IP @@ -596,11 +598,13 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode, size_t bytes = bench.bytes(); size_t matches = bench.matches(); if (diagnose) { + std::ios::fmtflags f(cout.flags()); cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput " << std::fixed << std::setprecision(3) << (bytes * 8 * repeatCount) / (scan_time * 1000000) << " Mbps, Matches " << matches << endl; + cout.flags(f); } return (bytes * 8 * repeatCount) / (scan_time * 1000000); } @@ -755,10 +759,12 @@ int main(int argc, char **argv) { for (unsigned i = count; i < 16; i++) { cout << " "; } + std::ios::fmtflags out_f(cout.flags()); cout << "Performance: "; print_criterion(criterion, best); cout << " (" << std::fixed << std::setprecision(3) << (best / score_base) << "x) after cutting:" << endl; + cout.flags(out_f); // s now has factor_max signatures for (const auto &found : s) { diff --git a/src/dispatcher.c b/src/dispatcher.c index 5ae46b56..c37a984e 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -127,6 +127,16 @@ CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes, CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes, const size_t length, size_t *deserialized_size); +CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream, + char 
*buf, size_t buf_space, size_t *used_space); + +CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db, + hs_stream_t **stream, const char *buf,size_t buf_size); + +CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream, + const char *buf, size_t buf_size, hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + /** INTERNALS **/ CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen); diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 92e75aaa..d33756d3 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -32,6 +32,7 @@ #include "fdr_internal.h" #include "fdr_loadval.h" #include "flood_runtime.h" +#include "scratch.h" #include "teddy.h" #include "teddy_internal.h" #include "util/arch.h" @@ -358,7 +359,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, } u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); confWithBit(fdrc, a, ptr_main - a->buf + byte, control, - last_match_id, confVal); + last_match_id, confVal, conf, bit); } while (unlikely(!!*conf)); } @@ -725,13 +726,17 @@ static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { + assert(ISALIGNED_CL(fdr)); + u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; u32 domain_mask_flipped = ~fdr->domainMask; u8 stride = fdr->stride; const u64a *ft = - (const u64a *)((const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR))); - const u32 *confBase = (const u32 *)((const u8 *)ft + fdr->tabSize); + (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); + assert(ISALIGNED_CL(ft)); + const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset); + assert(ISALIGNED_CL(confBase)); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); @@ -798,14 +803,14 @@ static const FDRFUNCTYPE funcs[] = { fdr_engine_exec, NULL, /* old: fast teddy */ NULL, /* old: fast teddy */ - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat), - ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat), + ONLY_AVX2(fdr_exec_fat_teddy_msks1), + ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks2), + ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks3), + ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks4), + ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck), fdr_exec_teddy_msks1, fdr_exec_teddy_msks1_pck, fdr_exec_teddy_msks2, @@ -820,8 +825,8 @@ static const FDRFUNCTYPE funcs[] = { static const u8 fake_history[FAKE_HISTORY_SIZE]; hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, void *ctxt, - hwlm_group_t groups) { + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, hwlm_group_t groups) { // We guarantee (for safezone construction) that it is safe to read 16 // bytes before the end of the history buffer. 
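    // Block-mode scans have no real history, so hbuf is pointed one past the
    // end of the static fake_history array: any "history" bytes read behind
    // offset 0 during safezone construction land inside fake_history rather
    // than in memory we do not own.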
const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE; @@ -833,7 +838,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, 0, start, cb, - ctxt, + scratch, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), 0 }; @@ -847,7 +852,8 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, void *ctxt, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, hwlm_group_t groups) { struct FDR_Runtime_Args a = { buf, @@ -856,7 +862,7 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, hlen, start, cb, - ctxt, + scratch, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), /* we are guaranteed to always have 16 initialised bytes at the end of * the history buffer (they may be garbage). */ diff --git a/src/fdr/fdr.h b/src/fdr/fdr.h index e2b80056..4dcef851 100644 --- a/src/fdr/fdr.h +++ b/src/fdr/fdr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,7 @@ extern "C" { #endif struct FDR; +struct hs_scratch; /** * \brief Block-mode scan. @@ -49,13 +50,13 @@ struct FDR; * \param fdr FDR matcher engine. * \param buf Buffer to scan. * \param len Length of buffer to scan. - * \param start First offset in buf at which a match may end. + * \param start First offset in buf at which a match may start. * \param cb Callback to call when a match is found. - * \param ctxt Caller-provided context pointer supplied to callback on match. + * \param scratch Scratch supplied to callback on match. * \param groups Initial groups mask. */ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, void *ctxt, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, hwlm_group_t groups); /** @@ -66,14 +67,15 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, * \param hlen Length of history buffer (hbuf). * \param buf Buffer to scan. * \param len Length of buffer to scan (buf). - * \param start First offset in buf at which a match may end. + * \param start First offset in buf at which a match may start. * \param cb Callback to call when a match is found. - * \param ctxt Caller-provided context pointer supplied to callback on match. + * \param scratch Scratch supplied to callback on match. * \param groups Initial groups mask. 
*/ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, void *ctxt, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, hwlm_group_t groups); #ifdef __cplusplus diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index c4ea50f2..5e3c6a4e 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -42,7 +42,9 @@ #include "ue2common.h" #include "hwlm/hwlm_build.h" #include "util/compare.h" +#include "util/container.h" #include "util/dump_mask.h" +#include "util/make_unique.h" #include "util/math.h" #include "util/noncopyable.h" #include "util/target_info.h" @@ -50,6 +52,7 @@ #include "util/verify_types.h" #include +#include #include #include #include @@ -61,6 +64,8 @@ #include #include #include +#include +#include #include #include @@ -81,7 +86,6 @@ private: bool make_small; u8 *tabIndexToMask(u32 indexInTable); - void assignStringsToBuckets(); #ifdef DEBUG void dumpMasks(const u8 *defaultMask); #endif @@ -90,10 +94,13 @@ private: void createInitialState(FDR *fdr); public: - FDRCompiler(vector lits_in, const FDREngineDescription &eng_in, + FDRCompiler(vector lits_in, + map> bucketToLits_in, + const FDREngineDescription &eng_in, bool make_small_in, const Grey &grey_in) : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), - lits(move(lits_in)), make_small(make_small_in) {} + lits(move(lits_in)), bucketToLits(move(bucketToLits_in)), + make_small(make_small_in) {} bytecode_ptr build(); }; @@ -144,61 +151,139 @@ void FDRCompiler::createInitialState(FDR *fdr) { } } +/** + * \brief Lay out FDR structures in bytecode. + * + * Note that each major structure (header, table, confirm, flood control) is + * cacheline-aligned. + */ bytecode_ptr FDRCompiler::setupFDR() { + auto floodTable = setupFDRFloodControl(lits, eng, grey); + auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); + + size_t headerSize = sizeof(FDR); size_t tabSize = eng.getTabSizeBytes(); - auto floodControlTmp = setupFDRFloodControl(lits, eng, grey); - auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small); - - assert(ISALIGNED_16(tabSize)); - assert(ISALIGNED_16(confirmTmp.size())); - assert(ISALIGNED_16(floodControlTmp.size())); - size_t headerSize = ROUNDUP_16(sizeof(FDR)); - size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.size() + - floodControlTmp.size()); + // Note: we place each major structure here on a cacheline boundary. + size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(tabSize) + + ROUNDUP_CL(confirmTable.size()) + floodTable.size(); DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu " "total=%zu\n", - headerSize, tabSize, confirmTmp.size(), floodControlTmp.size(), + headerSize, tabSize, confirmTable.size(), floodTable.size(), size); auto fdr = make_zeroed_bytecode_ptr(size, 64); assert(fdr); // otherwise would have thrown std::bad_alloc + u8 *fdr_base = (u8 *)fdr.get(); + + // Write header. 
fdr->size = size; fdr->engineID = eng.getID(); fdr->maxStringLen = verify_u32(maxLen(lits)); - createInitialState(fdr.get()); - - u8 *fdr_base = (u8 *)fdr.get(); - u8 *ptr = fdr_base + ROUNDUP_16(sizeof(FDR)); - copy(tab.begin(), tab.end(), ptr); - ptr += tabSize; - - memcpy(ptr, confirmTmp.get(), confirmTmp.size()); - ptr += confirmTmp.size(); - - fdr->floodOffset = verify_u32(ptr - fdr_base); - memcpy(ptr, floodControlTmp.get(), floodControlTmp.size()); - ptr += floodControlTmp.size(); - - /* we are allowing domains 9 to 15 only */ - assert(eng.bits > 8 && eng.bits < 16); + fdr->numStrings = verify_u32(lits.size()); + assert(eng.bits > 8 && eng.bits < 16); // we allow domains 9 to 15 only fdr->domain = eng.bits; fdr->domainMask = (1 << eng.bits) - 1; - fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8); + fdr->tabSize = tabSize; fdr->stride = eng.stride; + createInitialState(fdr.get()); + + // Write table. + u8 *ptr = fdr_base + ROUNDUP_CL(sizeof(FDR)); + assert(ISALIGNED_CL(ptr)); + copy(tab.begin(), tab.end(), ptr); + ptr += ROUNDUP_CL(tabSize); + + // Write confirm structures. + assert(ISALIGNED_CL(ptr)); + fdr->confOffset = verify_u32(ptr - fdr_base); + memcpy(ptr, confirmTable.get(), confirmTable.size()); + ptr += ROUNDUP_CL(confirmTable.size()); + + // Write flood control structures. + assert(ISALIGNED_CL(ptr)); + fdr->floodOffset = verify_u32(ptr - fdr_base); + memcpy(ptr, floodTable.get(), floodTable.size()); + ptr += floodTable.size(); // last write, no need to round up return fdr; } //#define DEBUG_ASSIGNMENT -static -double getScoreUtil(u32 len, u32 count) { - return len == 0 ? numeric_limits::max() - : our_pow(count, 1.05) * our_pow(len, -3.0); -} +/** + * Utility class for computing: + * + * score(count, len) = pow(count, 1.05) * pow(len, -3) + * + * Calling pow() is expensive. This is mitigated by using pre-computed LUTs for + * small inputs and a cache for larger ones. + */ +class Scorer { + unordered_map count_factor_cache; + + // LUT: pow(count, 1.05) for small values of count. + static const array count_lut; + + double count_factor(u32 count) { + if (count < count_lut.size()) { + return count_lut[count]; + } + + auto it = count_factor_cache.find(count); + if (it != count_factor_cache.end()) { + return it->second; + } + double r = our_pow(count, 1.05); + count_factor_cache.emplace(count, r); + return r; + } + + // LUT: pow(len, -3) for len in range [0,8]. 
+ static const array len_lut; + + double len_factor(u32 len) { + assert(len <= len_lut.size()); + return len_lut[len]; + } + +public: + double operator()(u32 len, u32 count) { + if (len == 0) { + return numeric_limits::max(); + } + return count_factor(count) * len_factor(len); + } +}; + +const array Scorer::count_lut{{ + pow(0, 1.05), pow(1, 1.05), pow(2, 1.05), pow(3, 1.05), pow(4, 1.05), + pow(5, 1.05), pow(6, 1.05), pow(7, 1.05), pow(8, 1.05), pow(9, 1.05), + pow(10, 1.05), pow(11, 1.05), pow(12, 1.05), pow(13, 1.05), pow(14, 1.05), + pow(15, 1.05), pow(16, 1.05), pow(17, 1.05), pow(18, 1.05), pow(19, 1.05), + pow(20, 1.05), pow(21, 1.05), pow(22, 1.05), pow(23, 1.05), pow(24, 1.05), + pow(25, 1.05), pow(26, 1.05), pow(27, 1.05), pow(28, 1.05), pow(29, 1.05), + pow(30, 1.05), pow(31, 1.05), pow(32, 1.05), pow(33, 1.05), pow(34, 1.05), + pow(35, 1.05), pow(36, 1.05), pow(37, 1.05), pow(38, 1.05), pow(39, 1.05), + pow(40, 1.05), pow(41, 1.05), pow(42, 1.05), pow(43, 1.05), pow(44, 1.05), + pow(45, 1.05), pow(46, 1.05), pow(47, 1.05), pow(48, 1.05), pow(49, 1.05), + pow(50, 1.05), pow(51, 1.05), pow(52, 1.05), pow(53, 1.05), pow(54, 1.05), + pow(55, 1.05), pow(56, 1.05), pow(57, 1.05), pow(58, 1.05), pow(59, 1.05), + pow(60, 1.05), pow(61, 1.05), pow(62, 1.05), pow(63, 1.05), pow(64, 1.05), + pow(65, 1.05), pow(66, 1.05), pow(67, 1.05), pow(68, 1.05), pow(69, 1.05), + pow(70, 1.05), pow(71, 1.05), pow(72, 1.05), pow(73, 1.05), pow(74, 1.05), + pow(75, 1.05), pow(76, 1.05), pow(77, 1.05), pow(78, 1.05), pow(79, 1.05), + pow(80, 1.05), pow(81, 1.05), pow(82, 1.05), pow(83, 1.05), pow(84, 1.05), + pow(85, 1.05), pow(86, 1.05), pow(87, 1.05), pow(88, 1.05), pow(89, 1.05), + pow(90, 1.05), pow(91, 1.05), pow(92, 1.05), pow(93, 1.05), pow(94, 1.05), + pow(95, 1.05), pow(96, 1.05), pow(97, 1.05), pow(98, 1.05), pow(99, 1.05), +}}; + +const array Scorer::len_lut{{ + pow(0, -3.0), pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0), + pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}}; /** * Returns true if the two given literals should be placed in the same chunk as @@ -297,7 +382,10 @@ next_literal: return chunks; } -void FDRCompiler::assignStringsToBuckets() { +static +map> assignStringsToBuckets( + vector &lits, + const FDREngineDescription &eng) { const double MAX_SCORE = numeric_limits::max(); assert(!lits.empty()); // Shouldn't be called with no literals. 
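    // For intuition about the scoring heuristic (illustrative numbers, not from
    // the source): ten literals of length 8 in a chunk score roughly
    // pow(10, 1.05) * pow(8, -3) ~= 11.22 * 0.00195 ~= 0.022, while the same
    // ten literals at length 2 score ~= 11.22 * 0.125 ~= 1.40. Lower scores are
    // better, so bucket assignments built from longer (and fewer) literals are
    // preferred.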
@@ -340,12 +428,14 @@ void FDRCompiler::assignStringsToBuckets() { boost::multi_array, 2> t( boost::extents[numChunks][numBuckets]); + Scorer scorer; + for (u32 j = 0; j < numChunks; j++) { u32 cnt = 0; for (u32 k = j; k < numChunks; ++k) { cnt += chunks[k].count; } - t[j][0] = {getScoreUtil(chunks[j].length, cnt), 0}; + t[j][0] = {scorer(chunks[j].length, cnt), 0}; } for (u32 i = 1; i < numBuckets; i++) { @@ -353,7 +443,7 @@ void FDRCompiler::assignStringsToBuckets() { pair best = {MAX_SCORE, 0}; u32 cnt = chunks[j].count; for (u32 k = j + 1; k < numChunks - 1; k++) { - auto score = getScoreUtil(chunks[j].length, cnt); + auto score = scorer(chunks[j].length, cnt); if (score > best.first) { break; // now worse locally than our best score, give up } @@ -381,6 +471,7 @@ void FDRCompiler::assignStringsToBuckets() { // our best score is in t[0][N_BUCKETS-1] and we can follow the links // to find where our buckets should start and what goes into them + vector> buckets; for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) { u32 j = t[i][n - 1].second; if (j == 0) { @@ -391,21 +482,33 @@ void FDRCompiler::assignStringsToBuckets() { u32 first_id = chunks[i].first_id; u32 last_id = chunks[j].first_id; assert(first_id < last_id); - u32 bucket = numBuckets - n; UNUSED const auto &first_lit = lits[first_id]; UNUSED const auto &last_lit = lits[last_id - 1]; - DEBUG_PRINTF("placing [%u-%u) in bucket %u (%u lits, len %zu-%zu, " - "score %0.4f)\n", - first_id, last_id, bucket, last_id - first_id, - first_lit.s.length(), last_lit.s.length(), - getScoreUtil(first_lit.s.length(), last_id - first_id)); + DEBUG_PRINTF("placing [%u-%u) in one bucket (%u lits, len %zu-%zu, " + "score %0.4f)\n", + first_id, last_id, last_id - first_id, + first_lit.s.length(), last_lit.s.length(), + scorer(first_lit.s.length(), last_id - first_id)); - auto &bucket_lits = bucketToLits[bucket]; - for (u32 k = first_id; k < last_id; k++) { - bucket_lits.push_back(k); + vector litIds; + u32 cnt = last_id - first_id; + // long literals first for included literals checking + for (u32 k = 0; k < cnt; k++) { + litIds.push_back(last_id - k - 1); } + i = j; + buckets.push_back(litIds); } + + // reverse bucket id, longer literals come first + map> bucketToLits; + size_t bucketCnt = buckets.size(); + for (size_t i = 0; i < bucketCnt; i++) { + bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i])); + } + + return bucketToLits; } #ifdef DEBUG @@ -426,7 +529,7 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng, const vector &vl, const vector &lits, SuffixPositionInString pos, - std::map > &m2) { + map> &m2) { assert(eng.bits < 32); u32 distance = 0; @@ -497,7 +600,7 @@ void FDRCompiler::setupTab() { SuffixPositionInString pLimit = eng.getBucketWidth(b); for (SuffixPositionInString pos = 0; pos < pLimit; pos++) { u32 bit = eng.getSchemeBit(b, pos); - map> m2; + map> m2; bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2); if (done) { clearbit(&defaultMask[0], bit); @@ -505,7 +608,7 @@ void FDRCompiler::setupTab() { } for (const auto &elem : m2) { u32 dc = elem.first; - const ue2::unordered_set &mskSet = elem.second; + const unordered_set &mskSet = elem.second; u32 v = ~dc; do { u32 b2 = v & dc; @@ -529,24 +632,222 @@ void FDRCompiler::setupTab() { } bytecode_ptr FDRCompiler::build() { - assignStringsToBuckets(); setupTab(); return setupFDR(); } +static +bool isSuffix(const hwlmLiteral &lit1, const hwlmLiteral &lit2) { + const auto &s1 = lit1.s; + const auto &s2 = lit2.s; + size_t len1 = s1.length(); + size_t 
len2 = s2.length(); + assert(len1 >= len2); + + if (lit1.nocase || lit2.nocase) { + return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2, + [](char a, char b) { return mytoupper(a) == mytoupper(b); }); + } else { + return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2); + } +} + +/* + * if lit2 is a suffix of lit1 but the case sensitivity, groups or mask info + * of lit2 is a subset of lit1, then lit1 can't squash lit2 and lit2 can + * possibly match when lit1 matches. In this case, we can't do bucket + * squashing. e.g. AAA(no case) in bucket 0, AA(no case) and aa in bucket 1, + * we can't squash bucket 1 if we have input like "aaa" as aa can also match. + */ +static +bool includedCheck(const hwlmLiteral &lit1, const hwlmLiteral &lit2) { + /* lit1 is caseless and lit2 is case sensitive */ + if ((lit1.nocase && !lit2.nocase)) { + return true; + } + + /* lit2's group is a subset of lit1 */ + if (lit1.groups != lit2.groups && + (lit2.groups == (lit1.groups & lit2.groups))) { + return true; + } + + /* TODO: narrow down cases for mask check */ + if (lit1.cmp != lit2.cmp || lit1.msk != lit2.msk) { + return true; + } + + return false; +} + +/* + * if lit2 is an included literal of both lit0 and lit1, then lit0 and lit1 + * shouldn't match at the same offset, otherwise we give up squashing for lit1. + * e.g. lit0:AAA(no case), lit1:aa, lit2:A(no case). We can have duplicate + * matches for input "aaa" if lit0 and lit1 both squash lit2. + */ +static +bool checkParentLit( + const vector &lits, u32 pos1, + const unordered_set &parent_map, + const unordered_map> &exception_map) { + assert(pos1 < lits.size()); + const auto &lit1 = lits[pos1]; + for (const auto pos2 : parent_map) { + if (contains(exception_map, pos2)) { + const auto &exception_pos = exception_map.at(pos2); + if (contains(exception_pos, pos1)) { + return false; + } + } + + /* if lit1 isn't an exception of lit2, then we have to do further + * exclusive check. + * TODO: More mask checks. Note if two literals are group exclusive, + * it is possible that they match at the same offset. 
*/ + assert(pos2 < lits.size()); + const auto &lit2 = lits[pos2]; + if (isSuffix(lit2, lit1)) { + return false; + } + } + + return true; +} + +static +void buildSquashMask(vector<hwlmLiteral> &lits, u32 id1, u32 bucket1, + size_t start, const vector<pair<u32, u32>> &group, + unordered_map<u32, unordered_set<u32>> &parent_map, + unordered_map<u32, unordered_set<u32>> &exception_map) { + auto &lit1 = lits[id1]; + DEBUG_PRINTF("b:%u len:%zu\n", bucket1, lit1.s.length()); + + size_t cnt = group.size(); + bool included = false; + bool exception = false; + u32 child_id = ~0U; + for (size_t i = start; i < cnt; i++) { + u32 bucket2 = group[i].first; + assert(bucket2 >= bucket1); + + u32 id2 = group[i].second; + auto &lit2 = lits[id2]; + // check if lit2 is a suffix of lit1 + if (isSuffix(lit1, lit2)) { + /* if we have an included literal in the same bucket, + * quit and let the included literal do possible squashing */ + if (bucket1 == bucket2) { + DEBUG_PRINTF("same bucket\n"); + return; + } + /* if lit2 is a suffix but doesn't pass included checks for + * extra info, we give up squashing */ + if (includedCheck(lit1, lit2)) { + DEBUG_PRINTF("find exceptional suffix %u\n", lit2.id); + exception_map[id1].insert(id2); + exception = true; + } else if (checkParentLit(lits, id1, parent_map[id2], + exception_map)) { + if (lit1.included_id == INVALID_LIT_ID) { + DEBUG_PRINTF("find suffix lit1 %u lit2 %u\n", + lit1.id, lit2.id); + lit1.included_id = lit2.id; + } else { + /* if we have multiple included literals in one bucket, + * give up squashing. */ + DEBUG_PRINTF("multiple included literals\n"); + lit1.included_id = INVALID_LIT_ID; + return; + } + child_id = id2; + included = true; + } + } + + size_t next = i + 1; + u32 nextBucket = next < cnt ? group[next].first : ~0U; + if (bucket2 != nextBucket) { + if (included) { + if (exception) { + /* give up if we have exception literals + * in the same bucket as the included literal. */ + lit1.included_id = INVALID_LIT_ID; + } else { + parent_map[child_id].insert(id1); + + lit1.squash |= 1U << bucket2; + DEBUG_PRINTF("build squash mask %2x for %u\n", + lit1.squash, lit1.id); + } + return; + } + exception = false; + } + } +} + +static constexpr u32 INCLUDED_LIMIT = 1000; + +static +void findIncludedLits(vector<hwlmLiteral> &lits, + const vector<vector<pair<u32, u32>>> &lastCharMap) { + /* Map for finding the positions of literals which include a literal + * in the FDR hwlm literal vector. */ + unordered_map<u32, unordered_set<u32>> parent_map; + + /* Map for finding the positions of exception literals which could + * sometimes match if a literal matches in the FDR hwlm literal vector.
*/ + unordered_map> exception_map; + for (const auto &group : lastCharMap) { + size_t cnt = group.size(); + if (cnt > INCLUDED_LIMIT) { + continue; + } + for (size_t i = 0; i < cnt; i++) { + u32 bucket1 = group[i].first; + u32 id1 = group[i].second; + buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map, + exception_map); + } + } +} + +static +void addIncludedInfo( + vector &lits, u32 nBuckets, + map> &bucketToLits) { + vector>> lastCharMap(256); + + for (BucketIndex b = 0; b < nBuckets; b++) { + if (!bucketToLits[b].empty()) { + for (const LiteralIndex &lit_idx : bucketToLits[b]) { + const auto &lit = lits[lit_idx]; + u8 c = mytoupper(lit.s.back()); + lastCharMap[c].emplace_back(b, lit_idx); + } + } + } + + findIncludedLits(lits, lastCharMap); +} + } // namespace static -bytecode_ptr fdrBuildTableInternal(const vector &lits, - bool make_small, const target_t &target, - const Grey &grey, u32 hint) { +unique_ptr fdrBuildProtoInternal(u8 engType, + vector &lits, + bool make_small, + const target_t &target, + const Grey &grey, u32 hint) { DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2"); if (grey.fdrAllowTeddy) { - auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, grey); - if (fdr) { + auto proto = teddyBuildProtoHinted(engType, lits, make_small, hint, + target); + if (proto) { DEBUG_PRINTF("build with teddy succeeded\n"); - return fdr; + return proto; } else { DEBUG_PRINTF("build with teddy failed, will try with FDR\n"); } @@ -564,23 +865,47 @@ bytecode_ptr fdrBuildTableInternal(const vector &lits, des->stride = 1; } - FDRCompiler fc(lits, *des, make_small, grey); + auto bucketToLits = assignStringsToBuckets(lits, *des); + addIncludedInfo(lits, des->getNumBuckets(), bucketToLits); + auto proto = + ue2::make_unique(engType, move(des), lits, bucketToLits, + make_small); + return proto; +} + +unique_ptr fdrBuildProto(u8 engType, vector lits, + bool make_small, const target_t &target, + const Grey &grey) { + return fdrBuildProtoInternal(engType, lits, make_small, target, grey, + HINT_INVALID); +} + +static +bytecode_ptr fdrBuildTableInternal(const HWLMProto &proto, + const Grey &grey) { + + if (proto.teddyEng) { + return teddyBuildTable(proto, grey); + } + + FDRCompiler fc(proto.lits, proto.bucketToLits, *(proto.fdrEng), + proto.make_small, grey); return fc.build(); } -bytecode_ptr fdrBuildTable(const vector &lits, - bool make_small, const target_t &target, - const Grey &grey) { - return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID); +bytecode_ptr fdrBuildTable(const HWLMProto &proto, const Grey &grey) { + return fdrBuildTableInternal(proto, grey); } #if !defined(RELEASE_BUILD) -bytecode_ptr fdrBuildTableHinted(const vector &lits, - bool make_small, u32 hint, - const target_t &target, - const Grey &grey) { - return fdrBuildTableInternal(lits, make_small, target, grey, hint); +unique_ptr fdrBuildProtoHinted(u8 engType, + vector lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey) { + return fdrBuildProtoInternal(engType, lits, make_small, target, grey, + hint); } #endif diff --git a/src/fdr/fdr_compile.h b/src/fdr/fdr_compile.h index 58047600..f0ce4925 100644 --- a/src/fdr/fdr_compile.h +++ b/src/fdr/fdr_compile.h @@ -34,6 +34,7 @@ #define FDR_COMPILE_H #include "ue2common.h" +#include "hwlm/hwlm_build.h" #include "util/bytecode_ptr.h" #include @@ -46,18 +47,23 @@ struct hwlmLiteral; struct Grey; struct target_t; -bytecode_ptr fdrBuildTable(const std::vector &lits, - bool make_small, const target_t 
&target, - const Grey &grey); +bytecode_ptr fdrBuildTable(const HWLMProto &proto, const Grey &grey); #if !defined(RELEASE_BUILD) - -bytecode_ptr fdrBuildTableHinted(const std::vector &lits, - bool make_small, u32 hint, - const target_t &target, const Grey &grey); - +std::unique_ptr fdrBuildProtoHinted( + u8 engType, + std::vector lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey); #endif +std::unique_ptr fdrBuildProto( + u8 engType, + std::vector lits, + bool make_small, const target_t &target, + const Grey &grey); + /** \brief Returns size in bytes of the given FDR engine. */ size_t fdrSize(const struct FDR *fdr); diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index 756fe8e7..3879960a 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -57,10 +57,11 @@ class FDREngineDescription; struct hwlmStreamingControl; struct Grey; -bytecode_ptr setupFullConfs(const std::vector &lits, - const EngineDescription &eng, - std::map> &bucketToLits, - bool make_small); +bytecode_ptr setupFullConfs( + const std::vector &lits, + const EngineDescription &eng, + const std::map> &bucketToLits, + bool make_small); // all suffixes include an implicit max_bucket_width suffix to ensure that // we always read a full-scale flood "behind" us in terms of what's in our diff --git a/src/fdr/fdr_confirm.h b/src/fdr/fdr_confirm.h index 6ce85afd..d975747e 100644 --- a/src/fdr/fdr_confirm.h +++ b/src/fdr/fdr_confirm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,12 +42,11 @@ u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) { #define CONF_TYPE u64a #define CONF_HASH_CALL mul_hash_64 -typedef enum LitInfoFlags { - NoFlags = 0, - Caseless = 1, - NoRepeat = 2, - ComplexConfirm = 4 -} LitInfoFlags; +/** + * \brief Flag indicating this literal doesn't need to be delivered more than + * once, used in LitInfo::flags. + */ +#define FDR_LIT_FLAG_NOREPEAT 1 /** * \brief Structure describing a literal, linked to by FDRConfirm. @@ -61,12 +60,12 @@ struct LitInfo { hwlm_group_t groups; u32 id; // literal ID as passed in u8 size; - u8 flags; /* LitInfoFlags */ + u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. u8 next; - u8 extended_size; }; #define FDRC_FLAG_NO_CONFIRM 1 +#define FDRC_FLAG_NOREPEAT 2 /** * \brief FDR confirm header. 
@@ -79,12 +78,8 @@ struct LitInfo { struct FDRConfirm { CONF_TYPE andmsk; CONF_TYPE mult; - u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID - u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all) + u32 nBits; hwlm_group_t groups; - u32 soleLitSize; - u32 soleLitCmp; - u32 soleLitMsk; }; static really_inline diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index b14ffb42..c75f8d17 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -35,6 +35,7 @@ #include "util/alloc.h" #include "util/bitutils.h" #include "util/compare.h" +#include "util/container.h" #include "util/verify_types.h" #include @@ -47,19 +48,6 @@ namespace ue2 { using BC2CONF = map>; -// return the number of bytes beyond a length threshold in all strings in lits -static -size_t thresholdedSize(const vector &lits, size_t threshold) { - size_t tot = 0; - for (const auto &lit : lits) { - size_t sz = lit.s.size(); - if (sz > threshold) { - tot += ROUNDUP_N(sz - threshold, 8); - } - } - return tot; -} - static u64a make_u64a_mask(const vector &v) { assert(v.size() <= sizeof(u64a)); @@ -92,19 +80,12 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, LitInfo &info = tmpLitInfo[i]; memset(&info, 0, sizeof(info)); info.id = lit.id; - u8 flags = NoFlags; - if (lit.nocase) { - flags |= Caseless; - } + u8 flags = 0; if (lit.noruns) { - flags |= NoRepeat; - } - if (lit.msk.size() > lit.s.size()) { - flags |= ComplexConfirm; - info.extended_size = verify_u8(lit.msk.size()); + flags |= FDR_LIT_FLAG_NOREPEAT; } info.flags = flags; - info.size = verify_u8(lit.s.size()); + info.size = verify_u8(max(lit.msk.size(), lit.s.size())); info.groups = lit.groups; // these are built up assuming a LE machine @@ -149,7 +130,12 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, static bytecode_ptr getFDRConfirm(const vector &lits, - bool make_small, bool make_confirm) { + bool make_small) { + // Every literal must fit within CONF_TYPE. 
+ assert(all_of_in(lits, [](const hwlmLiteral &lit) { + return lit.s.size() <= sizeof(CONF_TYPE); + })); + vector tmpLitInfo(lits.size()); CONF_TYPE andmsk; fillLitInfo(lits, tmpLitInfo, andmsk); @@ -167,40 +153,6 @@ bytecode_ptr getFDRConfirm(const vector &lits, } CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL; - u32 flags = 0; - // we use next three variables for 'confirmless' case to speed-up - // confirmation process - u32 soleLitSize = 0; - u32 soleLitCmp = 0; - u32 soleLitMsk = 0; - - if (!make_confirm) { - flags = FDRC_FLAG_NO_CONFIRM; - if (lits[0].noruns) { - flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted - } - mult = 0; - soleLitSize = lits[0].s.size() - 1; - // we can get to this point only in confirmless case; - // it means that we have only one literal per FDRConfirm (no packing), - // with no literal mask and size of literal is less or equal - // to the number of masks of Teddy engine; - // maximum number of masks for Teddy is 4, so the size of - // literal is definitely less or equal to size of u32 - assert(lits[0].s.size() <= sizeof(u32)); - for (u32 i = 0; i < lits[0].s.size(); i++) { - u32 shiftLoc = (sizeof(u32) - i - 1) * 8; - u8 c = lits[0].s[lits[0].s.size() - i - 1]; - if (lits[0].nocase && ourisalpha(c)) { - soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc; - soleLitMsk |= (u32)CASE_CLEAR << shiftLoc; - } - else { - soleLitCmp |= (u32)c << shiftLoc; - soleLitMsk |= (u32)0xff << shiftLoc; - } - } - } // we can walk the vector and assign elements from the vectors to a // map by hash value @@ -276,12 +228,11 @@ bytecode_ptr getFDRConfirm(const vector &lits, #endif const size_t bitsToLitIndexSize = (1U << nBits) * sizeof(u32); - const size_t totalLitSize = thresholdedSize(lits, sizeof(CONF_TYPE)); // this size can now be a worst-case as we can always be a bit smaller size_t size = ROUNDUP_N(sizeof(FDRConfirm), alignof(u32)) + ROUNDUP_N(bitsToLitIndexSize, alignof(LitInfo)) + - sizeof(LitInfo) * lits.size() + totalLitSize; + sizeof(LitInfo) * lits.size(); size = ROUNDUP_N(size, alignof(FDRConfirm)); auto fdrc = make_zeroed_bytecode_ptr(size); @@ -289,11 +240,7 @@ bytecode_ptr getFDRConfirm(const vector &lits, fdrc->andmsk = andmsk; fdrc->mult = mult; - fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? 
lits[0].id : nBits; - fdrc->flags = flags; - fdrc->soleLitSize = soleLitSize; - fdrc->soleLitCmp = soleLitCmp; - fdrc->soleLitMsk = soleLitMsk; + fdrc->nBits = nBits; fdrc->groups = gm; @@ -345,40 +292,37 @@ bytecode_ptr getFDRConfirm(const vector &lits, bytecode_ptr setupFullConfs(const vector &lits, const EngineDescription &eng, - map> &bucketToLits, + const map> &bucketToLits, bool make_small) { - bool makeConfirm = true; unique_ptr teddyDescr = getTeddyDescription(eng.getID()); - if (teddyDescr) { - makeConfirm = teddyDescr->needConfirm(lits); - } BC2CONF bc2Conf; u32 totalConfirmSize = 0; for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { - if (!bucketToLits[b].empty()) { + if (contains(bucketToLits, b)) { vector vl; - for (const LiteralIndex &lit_idx : bucketToLits[b]) { + for (const LiteralIndex &lit_idx : bucketToLits.at(b)) { vl.push_back(lits[lit_idx]); } DEBUG_PRINTF("b %d sz %zu\n", b, vl.size()); - auto fc = getFDRConfirm(vl, make_small, makeConfirm); + auto fc = getFDRConfirm(vl, make_small); totalConfirmSize += fc.size(); bc2Conf.emplace(b, move(fc)); } } u32 nBuckets = eng.getNumBuckets(); - u32 totalConfSwitchSize = nBuckets * sizeof(u32); - u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize); + u32 totalConfSwitchSize = ROUNDUP_CL(nBuckets * sizeof(u32)); + u32 totalSize = totalConfSwitchSize + totalConfirmSize; - auto buf = make_zeroed_bytecode_ptr(totalSize, 16); + auto buf = make_zeroed_bytecode_ptr(totalSize, 64); assert(buf); // otherwise would have thrown std::bad_alloc u32 *confBase = (u32 *)buf.get(); u8 *ptr = buf.get() + totalConfSwitchSize; + assert(ISALIGNED_CL(ptr)); for (const auto &m : bc2Conf) { const BucketIndex &idx = m.first; diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index a0603c92..067e50e2 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -29,6 +29,7 @@ #ifndef FDR_CONFIRM_RUNTIME_H #define FDR_CONFIRM_RUNTIME_H +#include "scratch.h" #include "fdr_internal.h" #include "fdr_loadval.h" #include "hwlm/hwlm.h" @@ -41,13 +42,14 @@ static really_inline void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a, size_t i, hwlmcb_rv_t *control, u32 *last_match, - u64a conf_key) { + u64a conf_key, u64a *conf, u8 bit) { assert(i < a->len); + assert(i >= a->start_offset); assert(ISALIGNED(fdrc)); const u8 * buf = a->buf; u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult, - fdrc->nBitsOrSoleID); + fdrc->nBits); u32 start = getConfirmLitIndex(fdrc)[c]; if (likely(!start)) { return; @@ -56,6 +58,10 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a const struct LitInfo *li = (const struct LitInfo *)((const u8 *)fdrc + start); + struct hs_scratch *scratch = a->scratch; + assert(!scratch->fdr_conf); + scratch->fdr_conf = conf; + scratch->fdr_conf_offset = bit; u8 oldNext; // initialized in loop do { assert(ISALIGNED(li)); @@ -64,7 +70,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a goto out; } - if ((*last_match == li->id) && (li->flags & NoRepeat)) { + if ((*last_match == li->id) && (li->flags & FDR_LIT_FLAG_NOREPEAT)) { goto out; } @@ -86,99 +92,13 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a goto out; } - if (unlikely(li->flags & ComplexConfirm)) { - const u8 *loc2 = buf + i - li->extended_size + 1; - if (loc2 < buf) { - u32 full_overhang = buf - loc2; - size_t len_history = a->len_history; - if (full_overhang > len_history) { - goto out; - } 
- } - } - *last_match = li->id; - *control = a->cb(loc - buf, i, li->id, a->ctxt); + *control = a->cb(i, li->id, scratch); out: oldNext = li->next; // oldNext is either 0 or an 'adjust' value li++; } while (oldNext); -} - -// 'light-weight' confirmation function which is used by 1-mask Teddy; -// in the 'confirmless' case it simply calls callback function, -// otherwise it calls 'confWithBit' function for the full confirmation procedure -static really_inline -void confWithBit1(const struct FDRConfirm *fdrc, - const struct FDR_Runtime_Args *a, size_t i, - hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { - assert(i < a->len); - assert(ISALIGNED(fdrc)); - - if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, control, last_match, conf_key); - return; - } else { - u32 id = fdrc->nBitsOrSoleID; - - if ((*last_match == id) && (fdrc->flags & NoRepeat)) { - return; - } - *last_match = id; - *control = a->cb(i, i, id, a->ctxt); - } -} - -// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy -// In the 'confirmless' case it makes fast 32-bit comparison, -// otherwise it calls 'confWithBit' function for the full confirmation procedure -static really_inline -void confWithBitMany(const struct FDRConfirm *fdrc, - const struct FDR_Runtime_Args *a, size_t i, CautionReason r, - hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { - assert(i < a->len); - assert(ISALIGNED(fdrc)); - - if (i < a->start_offset) { - return; - } - - if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, control, last_match, conf_key); - return; - } else { - const u32 id = fdrc->nBitsOrSoleID; - const u32 len = fdrc->soleLitSize; - - if ((*last_match == id) && (fdrc->flags & NoRepeat)) { - return; - } - - if (r == VECTORING && len > i - a->start_offset) { - if (len > i + a->len_history) { - return; - } - - u32 cmp = (u32)a->buf[i] << 24; - - if (len <= i) { - for (u32 j = 1; j <= len; j++) { - cmp |= (u32)a->buf[i - j] << (24 - (j * 8)); - } - } else { - for (u32 j = 1; j <= i; j++) { - cmp |= (u32)a->buf[i - j] << (24 - (j * 8)); - } - cmp |= (u32)(a->histBytes >> (40 + i * 8)); - } - - if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) { - return; - } - } - *last_match = id; - *control = a->cb(i - len, i, id, a->ctxt); - } + scratch->fdr_conf = NULL; } #endif diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index 7e794bb3..f4cd1f44 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,10 +30,12 @@ #include "fdr_compile.h" #include "fdr_compile_internal.h" +#include "fdr_confirm.h" #include "fdr_dump.h" #include "fdr_engine_description.h" #include "fdr_internal.h" #include "teddy_engine_description.h" +#include "teddy_internal.h" #include "ue2common.h" #include @@ -43,7 +45,7 @@ #error No dump support! 
#endif -using std::unique_ptr; +using namespace std; namespace ue2 { @@ -58,33 +60,127 @@ bool fdrIsTeddy(const FDR *fdr) { return !getFdrDescription(engine); } -void fdrPrintStats(const FDR *fdr, FILE *f) { - const bool isTeddy = fdrIsTeddy(fdr); +static +void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) { + const u32 *lit_index = getConfirmLitIndex(fdrc); + u32 num_lits = 1U << fdrc->nBits; + u32 lits_used = count_if(lit_index, lit_index + num_lits, + [](u32 idx) { return idx != 0; }); - if (isTeddy) { - fprintf(f, "TEDDY: %u\n", fdr->engineID); - } else { - fprintf(f, "FDR: %u\n", fdr->engineID); + fprintf(f, " load %u/%u (%0.2f%%)\n", lits_used, num_lits, + (double)lits_used / (double)(num_lits)*100); +} + +static +void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms, + FILE *f) { + const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset); + for (u32 i = 0; i < num_confirms; i++) { + const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]); + fprintf(f, " confirm %u\n", i); + fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk); + fprintf(f, " mult 0x%016llx\n", fdrc->mult); + fprintf(f, " nbits %u\n", fdrc->nBits); + fprintf(f, " groups 0x%016llx\n", fdrc->groups); + dumpLitIndex(fdrc, f); } +} - if (isTeddy) { - auto des = getTeddyDescription(fdr->engineID); - if (des) { - fprintf(f, " masks %u\n", des->numMasks); - fprintf(f, " buckets %u\n", des->getNumBuckets()); - fprintf(f, " packed %s\n", des->packed ? "true" : "false"); - } else { - fprintf(f, " \n"); +static +void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) { + // dump reinforcement masks + for (u32 b = 0; b < num_tables; b++) { + fprintf(f, " reinforcement table for bucket %u..%u:\n", + b * 8, b * 8 + 7); + for (u32 i = 0; i <= N_CHARS; i++) { + fprintf(f, " 0x%02x: ", i); + for (u32 j = 0; j < 8; j++) { + u8 val = rmsk[b * ((N_CHARS + 1) * 8) + i * 8 + j]; + for (u32 k = 0; k < 8; k++) { + fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0"); + } + fprintf(f, " "); + } + fprintf(f, "\n"); } - } else { - fprintf(f, " domain %u\n", fdr->domain); - fprintf(f, " stride %u\n", fdr->stride); + fprintf(f, "\n"); + } +} + +static +void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) { + // dump nibble masks + fprintf(f, " nibble masks:\n"); + for (u32 i = 0; i < numMasks * 2; i++) { + fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo"); + for (u32 j = 0; j < 16 * maskWidth; j++) { + u8 val = baseMsk[i * 16 * maskWidth + j]; + for (u32 k = 0; k < 8; k++) { + fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0"); + } + fprintf(f, " "); + } + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +static +void dumpTeddy(const Teddy *teddy, FILE *f) { + fprintf(f, "TEDDY: %u\n", teddy->engineID); + auto des = getTeddyDescription(teddy->engineID); + if (!des) { + fprintf(f, " \n"); + return; } - fprintf(f, " strings ???\n"); + fprintf(f, " masks %u\n", des->numMasks); + fprintf(f, " buckets %u\n", des->getNumBuckets()); + fprintf(f, " packed %s\n", des->packed ? 
"true" : "false"); + fprintf(f, " strings %u\n", teddy->numStrings); + fprintf(f, " size %zu bytes\n", fdrSize((const FDR *)teddy)); + fprintf(f, " max length %u\n", teddy->maxStringLen); + fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset, + teddy->floodOffset); + fprintf(f, "\n"); + + u32 maskWidth = des->getNumBuckets() / 8; + size_t headerSize = sizeof(Teddy); + size_t maskLen = des->numMasks * 16 * 2 * maskWidth; + const u8 *teddy_base = (const u8 *)teddy; + const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); + const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen); + dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f); + dumpTeddyReinforced(rmsk, maskWidth, f); + dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f); +} + +static +void dumpFDR(const FDR *fdr, FILE *f) { + fprintf(f, "FDR: %u\n", fdr->engineID); + auto des = getFdrDescription(fdr->engineID); + if (!des) { + fprintf(f, " \n"); + return; + } + + fprintf(f, " domain %u\n", fdr->domain); + fprintf(f, " stride %u\n", fdr->stride); + fprintf(f, " strings %u\n", fdr->numStrings); fprintf(f, " size %zu bytes\n", fdrSize(fdr)); fprintf(f, " max length %u\n", fdr->maxStringLen); fprintf(f, " floodoff %u (%x)\n", fdr->floodOffset, fdr->floodOffset); + fprintf(f, "\n"); + + dumpConfirms(fdr, fdr->confOffset, des->getNumBuckets(), f); +} + +void fdrPrintStats(const FDR *fdr, FILE *f) { + if (fdrIsTeddy(fdr)) { + dumpTeddy((const Teddy *)fdr, f); + } else { + dumpFDR(fdr, f); + } } } // namespace ue2 diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h index 09c5ce86..1c464fe3 100644 --- a/src/fdr/fdr_engine_description.h +++ b/src/fdr/fdr_engine_description.h @@ -30,7 +30,6 @@ #define FDR_ENGINE_DESCRIPTION_H #include "engine_description.h" -#include "util/ue2_containers.h" #include #include diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index a425d78c..c79f61c1 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -36,6 +36,8 @@ #include "ue2common.h" #include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback +struct hs_scratch; + typedef enum { NOT_CAUTIOUS, //!< not near a boundary (quantify?) VECTORING //!< potentially vectoring @@ -56,7 +58,6 @@ struct FDRFlood { u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids - u32 len[FDR_FLOOD_MAX_IDS]; //!< lengths to go with the string ids }; /** \brief FDR structure. @@ -69,19 +70,18 @@ struct FDR { u32 engineID; u32 size; u32 maxStringLen; + u32 numStrings; + u32 confOffset; u32 floodOffset; - - u8 stride; /* stride - how frequeuntly the data is consulted by the first + u8 stride; /* stride - how frequently the data is consulted by the first * stage matcher */ u8 domain; /* number of bits used to index into main FDR table. This value * is used only of debugging/asserts. */ u16 domainMask; /* pre-computed domain mask */ u32 tabSize; /* pre-computed hashtable size in bytes */ - u32 pad; - - m128 start; /* initial start state to use at offset 0. The state has been set - * up based on the min length of buckets to reduce the need for - * pointless confirms. */ + m128 start; /* initial start state to use at offset 0. The state has been + * set up based on the min length of buckets to reduce the need + * for pointless confirms. */ }; /** \brief FDR runtime arguments. 
@@ -97,7 +97,7 @@ struct FDR_Runtime_Args { size_t len_history; size_t start_offset; HWLMCallback cb; - void *ctxt; + struct hs_scratch *scratch; const u8 *firstFloodDetect; const u64a histBytes; }; diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp index 7dcc17d1..ff805ca3 100644 --- a/src/fdr/flood_compile.cpp +++ b/src/fdr/flood_compile.cpp @@ -82,11 +82,10 @@ void addFlood(vector &tmpFlood, u8 c, const hwlmLiteral &lit, fl.ids[fl.idCount] = lit.id; fl.allGroups |= lit.groups; fl.groups[fl.idCount] = lit.groups; - fl.len[fl.idCount] = suffix; // when idCount gets to max_ids this flood no longer happens // only incremented one more time to avoid arithmetic overflow DEBUG_PRINTF("Added Flood for char '%c' suffix=%u len[%hu]=%u\n", - c, fl.suffix, fl.idCount, suffix); + c, fl.suffix, fl.idCount, suffix); fl.idCount++; } } @@ -182,8 +181,7 @@ bytecode_ptr setupFDRFloodControl(const vector &lits, printf("i is %02x fl->idCount is %hd fl->suffix is %d fl->allGroups is " "%016llx\n", i, fl.idCount, fl.suffix, fl.allGroups); for (u32 j = 0; j < fl.idCount; j++) { - printf("j is %d fl.groups[j] %016llx fl.len[j] %d \n", j, - fl.groups[j], fl.len[j]); + printf("j is %d fl.groups[j] %016llx\n", j, fl.groups[j]); } } #endif diff --git a/src/fdr/flood_runtime.h b/src/fdr/flood_runtime.h index d3f6b3b2..2d5a32d9 100644 --- a/src/fdr/flood_runtime.h +++ b/src/fdr/flood_runtime.h @@ -94,7 +94,7 @@ const u8 * floodDetect(const struct FDR * fdr, const u8 * buf = a->buf; const size_t len = a->len; HWLMCallback cb = a->cb; - void * ctxt = a->ctxt; + struct hs_scratch *scratch = a->scratch; const u8 * ptr = *ptrPtr; // tryFloodDetect is never put in places where unconditional @@ -196,120 +196,110 @@ const u8 * floodDetect(const struct FDR * fdr, for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) { DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]); - u32 len0 = fl->len[0] - 1; if (*control & fl->groups[0]) { - *control = cb(i + t + 0 - len0, i + t + 0, fl->ids[0], ctxt); + *control = cb(i + t + 0, fl->ids[0], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt); + *control = cb(i + t + 1, fl->ids[0], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt); + *control = cb(i + t + 2, fl->ids[0], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt); + *control = cb(i + t + 3, fl->ids[0], scratch); } } break; case 2: for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) { - u32 len0 = fl->len[0] - 1; - u32 len1 = fl->len[1] - 1; if (*control & fl->groups[0]) { - *control = cb(i + t - len0, i + t, fl->ids[0], ctxt); + *control = cb(i + t, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t - len1, i + t, fl->ids[1], ctxt); + *control = cb(i + t, fl->ids[1], scratch); } if (*control & fl->groups[0]) { *control = - cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt); + cb(i + t + 1, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt); + *control = cb(i + t + 1, fl->ids[1], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt); + *control = cb(i + t + 2, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t + 2 - len1, i + t + 2, fl->ids[1], ctxt); + *control = cb(i + t + 2, fl->ids[1], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + 
t + 3 - len0, i + t + 3, fl->ids[0], ctxt); + *control = cb(i + t + 3, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t + 3 - len1, i + t + 3, fl->ids[1], ctxt); + *control = cb(i + t + 3, fl->ids[1], scratch); } } break; case 3: for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { - u32 len0 = fl->len[0] - 1; - u32 len1 = fl->len[1] - 1; - u32 len2 = fl->len[2] - 1; if (*control & fl->groups[0]) { - *control = cb(i + t - len0, i + t, fl->ids[0], ctxt); + *control = cb(i + t, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t - len1, i + t, fl->ids[1], ctxt); + *control = cb(i + t, fl->ids[1], scratch); } if (*control & fl->groups[2]) { - *control = cb(i + t - len2, i + t, fl->ids[2], ctxt); + *control = cb(i + t, fl->ids[2], scratch); } if (*control & fl->groups[0]) { - *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt); + *control = cb(i + t + 1, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt); + *control = cb(i + t + 1, fl->ids[1], scratch); } if (*control & fl->groups[2]) { - *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt); + *control = cb(i + t + 1, fl->ids[2], scratch); } } break; default: // slow generalized loop for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { - u32 len0 = fl->len[0] - 1; - u32 len1 = fl->len[1] - 1; - u32 len2 = fl->len[2] - 1; - u32 len3 = fl->len[3] - 1; if (*control & fl->groups[0]) { - *control = cb(i + t - len0, i + t, fl->ids[0], ctxt); + *control = cb(i + t, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t - len1, i + t, fl->ids[1], ctxt); + *control = cb(i + t, fl->ids[1], scratch); } if (*control & fl->groups[2]) { - *control = cb(i + t - len2, i + t, fl->ids[2], ctxt); + *control = cb(i + t, fl->ids[2], scratch); } if (*control & fl->groups[3]) { - *control = cb(i + t - len3, i + t, fl->ids[3], ctxt); + *control = cb(i + t, fl->ids[3], scratch); } for (u32 t2 = 4; t2 < fl->idCount; t2++) { if (*control & fl->groups[t2]) { - *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt); + *control = cb(i + t, fl->ids[t2], scratch); } } if (*control & fl->groups[0]) { - *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt); + *control = cb(i + t + 1, fl->ids[0], scratch); } if (*control & fl->groups[1]) { - *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt); + *control = cb(i + t + 1, fl->ids[1], scratch); } if (*control & fl->groups[2]) { - *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt); + *control = cb(i + t + 1, fl->ids[2], scratch); } if (*control & fl->groups[3]) { - *control = cb(i + t + 1 - len3, i + t + 1, fl->ids[3], ctxt); + *control = cb(i + t + 1, fl->ids[3], scratch); } for (u32 t2 = 4; t2 < fl->idCount; t2++) { if (*control & fl->groups[t2]) { - *control = cb(i + t + 1 - (fl->len[t2] - 1), i + t + 1, fl->ids[t2], ctxt); + *control = cb(i + t + 1, fl->ids[t2], scratch); } } } @@ -320,7 +310,7 @@ const u8 * floodDetect(const struct FDR * fdr, for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) { for (u32 t2 = 0; t2 < fl->idCount; t2++) { if (*control & fl->groups[t2]) { - *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt); + *control = cb(i + t, fl->ids[t2], scratch); } } } diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index a3f7cfaf..0b3fe28f 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -38,90 +38,588 @@ #include "util/simd_utils.h" const u8 
ALIGN_DIRECTIVE p_mask_arr[17][32] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ 
+do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#if defined(HAVE_AVX512) // AVX512 reinforced teddy + #ifdef ARCH_64_BIT #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero128(var))) { \ - u64a lo = movq(var); \ - u64a hi = movq(rshiftbyte_m128(var, 8)); \ - if (unlikely(lo)) { \ - conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(hi)) { \ - conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn); \ } \ -} while (0); +} while(0) #else #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero128(var))) { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, 
conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr)); \ + *c_16 = *(ptr + 15); \ + *c_32 = *(ptr + 31); \ + *c_48 = *(ptr + 47); \ + m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\ + 0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\ + *c_0 = *(ptr + 63) + +#define SHIFT_OR_M1 \ + or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M1, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M2, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M3, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, \ + &c_0, &c_16, &c_32, &c_48) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); 
\ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 128; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_16 = 0x100; \ + u32 c_32 = 0x100; \ + u32 c_48 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 64); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 64; \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 64; \ + } \ + \ + assert(ptr + 64 > buf_end); \ + if (ptr < buf_end) { \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u64a part1 = movq(lo); \ + u64a part2 = movq(rshiftbyte_m128(lo, 8)); \ + u64a part3 = movq(hi); \ + u64a part4 = movq(rshiftbyte_m128(hi, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u32 part1 = movd(lo); \ + u32 part2 = 
movd(rshiftbyte_m128(lo, 4)); \ + u32 part3 = movd(rshiftbyte_m128(lo, 8)); \ + u32 part4 = movd(rshiftbyte_m128(lo, 12)); \ + u32 part5 = movd(hi); \ + u32 part6 = movd(rshiftbyte_m128(hi, 4)); \ + u32 part7 = movd(rshiftbyte_m128(hi, 8)); \ + u32 part8 = movd(rshiftbyte_m128(hi, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ + m256 lo = and256(val, *lo_mask); \ + m256 hi = and256(rshift64_m256(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ + *c_128 = *(ptr + 15); \ + m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + *c_0 = *(ptr + 31) + +#define SHIFT_OR_M1 \ + or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo), \ + pshufb_m256(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo), \ + pshufb_m256(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo), \ + pshufb_m256(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M1, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M2, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M3, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) + 
+#define PREPARE_MASKS_1 \ + dup_mask[0] = set2x128(maskBase[0]); \ + dup_mask[1] = set2x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set2x128(maskBase[2]); \ + dup_mask[3] = set2x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set2x128(maskBase[4]); \ + dup_mask[5] = set2x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set2x128(maskBase[6]); \ + dup_mask[7] = set2x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m256 lo_mask = set32x8(0xf); \ + m256 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_128 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 32; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 32; \ + } \ + \ + assert(ptr + 32 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // not defined HAVE_AVX2 + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ + u64a lo = movq(var); \ + u64a hi = movq(rshiftbyte_m128(var, 8)); \ + CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ u32 part1 = movd(var); \ u32 part2 = movd(rshiftbyte_m128(var, 4)); \ u32 part3 = movd(rshiftbyte_m128(var, 8)); \ u32 part4 = movd(rshiftbyte_m128(var, 12)); \ - if (unlikely(part1)) { \ - conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - &control, 
&last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part2)) { \ - conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part3)) { \ - conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part4)) { \ - conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ } \ -} while (0); +} while(0) #endif static really_inline @@ -129,8 +627,8 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - return and128(pshufb_m128(maskBase[0 * 2], lo), - pshufb_m128(maskBase[0 * 2 + 1], hi)); + return or128(pshufb_m128(maskBase[0 * 2], lo), + pshufb_m128(maskBase[0 * 2 + 1], hi)); } static really_inline @@ -140,11 +638,11 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); - m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo), - pshufb_m128(maskBase[1*2+1], hi)); - m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); + m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo), + pshufb_m128(maskBase[1 * 2 + 1], hi)); + m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1); *old_1 = res_1; - return and128(r, res_shifted_1); + return or128(r, res_shifted_1); } static really_inline @@ -155,11 +653,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); - m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo), - pshufb_m128(maskBase[2*2+1], hi)); - m128 res_shifted_2 = palignr(res_2, *old_2, 16-2); + m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo), + pshufb_m128(maskBase[2 * 2 + 1], hi)); + m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2); *old_2 = res_2; - return and128(r, res_shifted_2); + return or128(r, res_shifted_2); } static really_inline @@ -170,487 +668,154 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); - m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo), - pshufb_m128(maskBase[3*2+1], hi)); - m128 res_shifted_3 = palignr(res_3, *old_3, 16-3); + m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo), + pshufb_m128(maskBase[3 * 2 + 1], hi)); + m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3); *old_3 = res_3; - return and128(r, res_shifted_3); + return or128(r, res_shifted_3); } +#define FDR_EXEC_TEDDY_RES_OLD_1 + +#define FDR_EXEC_TEDDY_RES_OLD_2 \ + m128 res_old_1 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_3 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_4 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); \ + m128 res_old_3 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n + +#define PREP_CONF_FN_1(mask_base, val) \ + prep_conf_teddy_m1(mask_base, val) + +#define PREP_CONF_FN_2(mask_base, val) \ + 
prep_conf_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FN_3(mask_base, val) \ + prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FN_4(mask_base, val) \ + prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FN(mask_base, val, n) \ + PREP_CONF_FN_##n(mask_base, val) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk); \ + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX2 HAVE_AVX512 + hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - ptr 
+= 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 2); - - m128 res_old_1 = ones128(); - const u8 *mainStart = 
ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 2); - - m128 res_old_1 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, const struct FDR_Runtime_Args *a, 
hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 3); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 3); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, 
&res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 4); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - m128 res_old_3 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 4); - - m128 res_old_1 = ones128(); - m128 
res_old_2 = ones128(); - m128 res_old_3 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index 35756c53..40ae0756 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -73,37 +73,37 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, #if defined(HAVE_AVX2) -hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const 
struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); #endif /* HAVE_AVX2 */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 299825cc..56ec739f 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -40,10 +40,421 @@ #if defined(HAVE_AVX2) +const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m256 *getMaskBase_fat(const struct Teddy *teddy) { + return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +#if defined(HAVE_AVX512) + +static 
really_inline +const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase_fat(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m256))); +} + #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero256(var))) { \ + if (unlikely(diff512(var, ones512()))) { \ + m512 swap = swap256in512(var); \ + m512 r = interleave512lo(var, swap); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + u64a part1 = movq(r0); \ + u64a part2 = extract64from128(r0, 1); \ + u64a part5 = movq(r1); \ + u64a part6 = extract64from128(r1, 1); \ + r = interleave512hi(var, swap); \ + r0 = extract128from512(r, 0); \ + r1 = extract128from512(r, 1); \ + u64a part3 = movq(r0); \ + u64a part4 = extract64from128(r0, 1); \ + u64a part7 = movq(r1); \ + u64a part8 = extract64from128(r1, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m512 swap = swap256in512(var); \ + m512 r = interleave512lo(var, swap); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + u32 part1 = movd(r0); \ + u32 part2 = extract32from128(r0, 1); \ + u32 part3 = extract32from128(r0, 2); \ + u32 part4 = extract32from128(r0, 3); \ + u32 part9 = movd(r1); \ + u32 part10 = extract32from128(r1, 1); \ + u32 part11 = extract32from128(r1, 2); \ + u32 part12 = extract32from128(r1, 3); \ + r = interleave512hi(var, swap); \ + r0 = extract128from512(r, 0); \ + r1 = extract128from512(r, 1); \ + u32 part5 = movd(r0); \ + u32 part6 = extract32from128(r0, 1); \ + u32 part7 = extract32from128(r0, 2); \ + u32 part8 = extract32from128(r0, 3); \ + u32 part13 = movd(r1); \ + u32 part14 = extract32from128(r1, 1); \ + u32 part15 = extract32from128(r1, 2); \ + u32 part16 = extract32from128(r1, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \ + } \ +} 
while(0) +#endif + +static really_inline +m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + m256 p_mask256; + m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi, + buf_history, len_history, nMasks)); + *p_mask = set2x256(p_mask256); + return ret; +} + +#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define PREP_FAT_SHUF_MASK \ + PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \ + *c_16 = *(ptr + 15); \ + m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \ + 0ULL, r_msk_base_hi[*c_0], \ + 0ULL, r_msk_base_lo[*c_16], \ + 0ULL, r_msk_base_lo[*c_0]); \ + *c_0 = *(ptr + 31) + +#define FAT_SHIFT_OR_M1 \ + or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) + +#define FAT_SHIFT_OR_M2 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)), \ + 1), FAT_SHIFT_OR_M1) + +#define FAT_SHIFT_OR_M3 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)), \ + 2), FAT_SHIFT_OR_M2) + +#define FAT_SHIFT_OR_M4 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)), \ + 3), FAT_SHIFT_OR_M3) + +static really_inline +m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); + return FAT_SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); + return FAT_SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); + return FAT_SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); + return FAT_SHIFT_OR_M4; +} + +static really_inline +m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base_lo, + const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + PREP_FAT_SHUF_MASK; + return or512(FAT_SHIFT_OR_M1, r_msk); +} + +static really_inline +m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base_lo, + const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + PREP_FAT_SHUF_MASK; + return or512(FAT_SHIFT_OR_M2, r_msk); +} + +static really_inline +m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base_lo, + const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + PREP_FAT_SHUF_MASK; + return or512(FAT_SHIFT_OR_M3, r_msk); +} + +static really_inline +m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base_lo, + const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + PREP_FAT_SHUF_MASK; + return or512(FAT_SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FAT_FN(ptr, n) \ + prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \ + r_msk_base_lo, r_msk_base_hi, &c_0, &c_16) + +/* + * In FAT teddy, it needs 2 bytes to represent 
result of each position, + * so each nibble's(for example, lo nibble of last byte) FAT teddy mask + * has 16x2 bytes: + * |----------------------------------|----------------------------------| + * 16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte) + * A B + * at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes: + * |----------------------------------|----------------------------------| + * 16bytes input data (lo nibbles) 16bytes duplicated data (lo nibbles) + * X X + * then do pshufb_m256(AB, XX). + * + * In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them + * to 64 bytes: + * |----------------|----------------|----------------|----------------| + * X Y X Y + * in this case we need DUP_FAT_MASK to construct AABB: + * |----------------|----------------|----------------|----------------| + * A A B B + * then do pshufb_m512(AABB, XYXY). + */ + +#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a) + +#define PREPARE_FAT_MASKS_1 \ + dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \ + dup_mask[1] = DUP_FAT_MASK(maskBase[1]); + +#define PREPARE_FAT_MASKS_2 \ + PREPARE_FAT_MASKS_1 \ + dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \ + dup_mask[3] = DUP_FAT_MASK(maskBase[3]); + +#define PREPARE_FAT_MASKS_3 \ + PREPARE_FAT_MASKS_2 \ + dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \ + dup_mask[5] = DUP_FAT_MASK(maskBase[5]); + +#define PREPARE_FAT_MASKS_4 \ + PREPARE_FAT_MASKS_3 \ + dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \ + dup_mask[7] = DUP_FAT_MASK(maskBase[7]); + +#define PREPARE_FAT_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + PREPARE_FAT_MASKS_##n + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m256 *maskBase = getMaskBase_fat(teddy); \ + PREPARE_FAT_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \ + const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \ + u32 c_0 = 0x100; \ + u32 c_16 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 32; \ + m512 p_mask; \ + m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \ + CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 
0, NOT_CAUTIOUS, conf_fn); \ + ptr += 32; \ + } \ + \ + assert(ptr + 32 > buf_end); \ + if (ptr < buf_end) { \ + m512 p_mask; \ + m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // HAVE_AVX512 + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ m256 swap = swap128in256(var); \ m256 r = interleave256lo(var, swap); \ u64a part1 = extractlow64from256(r); \ @@ -51,32 +462,16 @@ do { \ r = interleave256hi(var, swap); \ u64a part3 = extractlow64from256(r); \ u64a part4 = extract64from256(r, 1); \ - if (unlikely(part1)) { \ - conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part2)) { \ - conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part3)) { \ - conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part4)) { \ - conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ } \ -} while (0); +} while(0) #else #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero256(var))) { \ + if (unlikely(diff256(var, ones256()))) { \ m256 swap = swap128in256(var); \ m256 r = interleave256lo(var, swap); \ u32 part1 = extractlow32from256(r); \ @@ -88,56 +483,26 @@ do { \ u32 part6 = extract32from256(r, 1); \ u32 part7 = extract32from256(r, 2); \ u32 part8 = extract32from256(r, 3); \ - if (unlikely(part1)) { \ - conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part2)) { \ - conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \ - &control, &last_match); \ - } \ - if (unlikely(part3)) { \ - conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part4)) { \ - conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part5)) { \ - conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part6)) { \ - conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part7)) { \ - conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - if (unlikely(part8)) { \ - conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \ - &control, &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + 
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ } \ -} while (0); +} while(0) #endif static really_inline -m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, +m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, const u32 nMasks) { m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, - len_history, nMasks)); + m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + buf_history, len_history, nMasks)); *p_mask = set2x128(p_mask128); return ret; } @@ -147,8 +512,8 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(pshufb_m256(maskBase[0*2], lo), - pshufb_m256(maskBase[0*2+1], hi)); + return or256(pshufb_m256(maskBase[0 * 2], lo), + pshufb_m256(maskBase[0 * 2 + 1], hi)); } static really_inline @@ -158,11 +523,11 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); - m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo), - pshufb_m256(maskBase[1*2+1], hi)); - m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); + m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo), + pshufb_m256(maskBase[1 * 2 + 1], hi)); + m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1); *old_1 = res_1; - return and256(r, res_shifted_1); + return or256(r, res_shifted_1); } static really_inline @@ -173,11 +538,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); - m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo), - pshufb_m256(maskBase[2*2+1], hi)); - m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2); + m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo), + pshufb_m256(maskBase[2 * 2 + 1], hi)); + m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2); *old_2 = res_2; - return and256(r, res_shifted_2); + return or256(r, res_shifted_2); } static really_inline @@ -188,504 +553,160 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); - m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo), - pshufb_m256(maskBase[3*2+1], hi)); - m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3); + m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo), + pshufb_m256(maskBase[3 * 2 + 1], hi)); + m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3); *old_3 = res_3; - return and256(r, res_shifted_3); + return or256(r, res_shifted_3); } -static really_inline -const m256 * getMaskBase_avx2(const struct Teddy *teddy) { - return (const m256 *)((const u8 *)teddy + sizeof(struct Teddy)); +#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \ +do { \ +} while(0) + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \ + m256 res_old_1 = zeroes256(); + +#define 
FDR_EXEC_FAT_TEDDY_RES_OLD_3 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); \ + m256 res_old_3 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n + +#define PREP_CONF_FAT_FN_1(mask_base, val) \ + prep_conf_fat_teddy_m1(mask_base, val) + +#define PREP_CONF_FAT_FN_2(mask_base, val) \ + prep_conf_fat_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FAT_FN_3(mask_base, val) \ + prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FAT_FN_4(mask_base, val) \ + prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FAT_FN(mask_base, val, n) \ + PREP_CONF_FAT_FN_##n(mask_base, val) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m256 *maskBase = getMaskBase_fat(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \ + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX512 + +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } -static really_inline -const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) { - return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + - (numMask*32*2)); +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args 
*a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 1); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 1); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += 
iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 2); - - m256 res_old_1 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t 
iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 2); - - m256 res_old_1 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 3); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, 
&res_old_2, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } -hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 3); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; -} - -hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 4); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - m256 res_old_3 = ones256(); - const 
u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; -} - -hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase_avx2(teddy, 4); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - m256 res_old_3 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - 
a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } #endif // HAVE_AVX2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 6f956e8c..9a1e54a1 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -42,10 +42,14 @@ #include "teddy_engine_description.h" #include "grey.h" #include "ue2common.h" +#include "hwlm/hwlm_build.h" #include "util/alloc.h" #include "util/compare.h" +#include "util/container.h" +#include "util/make_unique.h" #include "util/noncopyable.h" #include "util/popcount.h" +#include "util/small_vector.h" #include "util/target_info.h" #include "util/verify_types.h" @@ -69,38 +73,58 @@ namespace { //#define TEDDY_DEBUG +/** \brief Max number of Teddy masks we use. */ +static constexpr size_t MAX_NUM_MASKS = 4; + class TeddyCompiler : noncopyable { const TeddyEngineDescription &eng; const Grey &grey; const vector &lits; + map> bucketToLits; bool make_small; public: TeddyCompiler(const vector &lits_in, + map> bucketToLits_in, const TeddyEngineDescription &eng_in, bool make_small_in, const Grey &grey_in) - : eng(eng_in), grey(grey_in), lits(lits_in), make_small(make_small_in) { - } + : eng(eng_in), grey(grey_in), lits(lits_in), + bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} bytecode_ptr build(); - bool pack(map > &bucketToLits); }; class TeddySet { + /** + * \brief Estimate of the max number of literals in a set, used to + * minimise allocations. + */ + static constexpr size_t LITS_PER_SET = 20; + + /** \brief Number of masks. */ u32 len; - // nibbleSets is a series of bitfields over 16 predicates - // that represent the whether shufti nibble set - // so for num_masks = 4 we will represent our strings by - // 8 u16s in the vector that indicate what a shufti bucket - // would have to look like - vector nibbleSets; - set litIds; + + /** + * \brief A series of bitfields over 16 predicates that represent the + * shufti nibble set. + * + * So for num_masks = 4 we will represent our strings by 8 u16s in the + * vector that indicate what a shufti bucket would have to look like. + */ + small_vector nibbleSets; + + /** + * \brief Sorted, unique set of literals. We maintain our own set in a + * sorted vector to minimise allocations. + */ + small_vector litIds; + public: explicit TeddySet(u32 len_in) : len(len_in), nibbleSets(len_in * 2, 0) {} - const set & getLits() const { return litIds; } size_t litCount() const { return litIds.size(); } + const small_vector &getLits() const { return litIds; } - bool operator<(const TeddySet & s) const { + bool operator<(const TeddySet &s) const { return litIds < s.litIds; } @@ -116,11 +140,11 @@ public: printf("%u ", id); } printf("\n"); - printf("Flood prone : %s\n", isRunProne()?"yes":"no"); + printf("Flood prone : %s\n", isRunProne() ?
"yes" : "no"); } #endif - bool identicalTail(const TeddySet & ts) const { + bool identicalTail(const TeddySet &ts) const { return nibbleSets == ts.nibbleSets; } @@ -131,24 +155,19 @@ public: u8 c = s[s.size() - i - 1]; u8 c_hi = (c >> 4) & 0xf; u8 c_lo = c & 0xf; - nibbleSets[i*2] = 1 << c_lo; + nibbleSets[i * 2] = 1 << c_lo; if (lit.nocase && ourisalpha(c)) { - nibbleSets[i*2+1] = (1 << (c_hi&0xd)) | (1 << (c_hi|0x2)); + nibbleSets[i * 2 + 1] = + (1 << (c_hi & 0xd)) | (1 << (c_hi | 0x2)); } else { - nibbleSets[i*2+1] = 1 << c_hi; + nibbleSets[i * 2 + 1] = 1 << c_hi; } } else { - nibbleSets[i*2] = nibbleSets[i*2+1] = 0xffff; + nibbleSets[i * 2] = nibbleSets[i * 2 + 1] = 0xffff; } } - litIds.insert(lit_id); - } - - void merge(const TeddySet &ts) { - for (u32 i = 0; i < nibbleSets.size(); i++) { - nibbleSets[i] |= ts.nibbleSets[i]; - } - litIds.insert(ts.litIds.begin(), ts.litIds.end()); + litIds.push_back(lit_id); + sort_and_unique(litIds); } // return a value p from 0 .. MAXINT64 that gives p/MAXINT64 @@ -167,15 +186,15 @@ public: // a small fixed cost + the cost of traversing some sort of followup // (assumption is that the followup is linear) u64a heuristic() const { - return probability() * (2+litCount()); + return probability() * (2 + litCount()); } bool isRunProne() const { u16 lo_and = 0xffff; u16 hi_and = 0xffff; for (u32 i = 0; i < len; i++) { - lo_and &= nibbleSets[i*2]; - hi_and &= nibbleSets[i*2+1]; + lo_and &= nibbleSets[i * 2]; + hi_and &= nibbleSets[i * 2 + 1]; } // we're not flood-prone if there's no way to get // through with a flood @@ -184,10 +203,27 @@ public: } return true; } + + friend TeddySet merge(const TeddySet &a, const TeddySet &b) { + assert(a.nibbleSets.size() == b.nibbleSets.size()); + + TeddySet m(a); + + for (size_t i = 0; i < m.nibbleSets.size(); i++) { + m.nibbleSets[i] |= b.nibbleSets[i]; + } + + m.litIds.insert(m.litIds.end(), b.litIds.begin(), b.litIds.end()); + sort_and_unique(m.litIds); + + return m; + } }; -bool TeddyCompiler::pack(map > &bucketToLits) { +static +bool pack(const vector &lits, + const TeddyEngineDescription &eng, + map> &bucketToLits) { set sts; for (u32 i = 0; i < lits.size(); i++) { @@ -200,7 +236,8 @@ bool TeddyCompiler::pack(map TeddyCompiler::build() { - if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { - DEBUG_PRINTF("too many literals: %zu\n", lits.size()); - return nullptr; - } +// this entry has all-zero mask to skip reinforcement +#define NO_REINFORCEMENT N_CHARS -#ifdef TEDDY_DEBUG - for (size_t i = 0; i < lits.size(); i++) { - printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(), - lits[i].nocase ? 
"caseless" : "caseful"); - for (size_t j = 0; j < lits[i].s.size(); j++) { - printf("%02x", ((u32)lits[i].s[j])&0xff); - } - printf("\n"); - } -#endif +// this means every entry in reinforcement table +#define ALL_CHAR_SET N_CHARS - map > bucketToLits; - if(eng.needConfirm(lits)) { - if (!pack(bucketToLits)) { - DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n", - lits.size(), eng.getNumBuckets()); - return nullptr; +// each item's reinforcement mask has REINFORCED_MSK_LEN bytes +#define REINFORCED_MSK_LEN 8 + +// reinforcement table size for each 8 buckets set +#define RTABLE_SIZE ((N_CHARS + 1) * REINFORCED_MSK_LEN) + +static +void initReinforcedTable(u8 *rmsk) { + u64a *mask = (u64a *)rmsk; + fill_n(mask, N_CHARS, 0x00ffffffffffffffULL); +} + +static +void fillReinforcedMskZero(u8 *rmsk) { + u8 *mc = rmsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN; + fill_n(mc, REINFORCED_MSK_LEN, 0x00); +} + +static +void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) { + assert(j > 0); + if (c == ALL_CHAR_SET) { + for (size_t i = 0; i < N_CHARS; i++) { + u8 *mc = rmsk + i * REINFORCED_MSK_LEN; + mc[j - 1] &= ~bmsk; } } else { - for (u32 i = 0; i < lits.size(); i++) { - bucketToLits[i].push_back(i); - } + u8 *mc = rmsk + c * REINFORCED_MSK_LEN; + mc[j - 1] &= ~bmsk; } - u32 maskWidth = eng.getNumBuckets() / 8; +} - size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - - auto floodControlTmp = setupFDRFloodControl(lits, eng, grey); - auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small); - - size_t size = ROUNDUP_N(sizeof(Teddy) + - maskLen + - confirmTmp.size() + - floodControlTmp.size(), - 16 * maskWidth); - - auto fdr = make_zeroed_bytecode_ptr(size, 64); - assert(fdr); // otherwise would have thrown std::bad_alloc - Teddy *teddy = (Teddy *)fdr.get(); // ugly - u8 *teddy_base = (u8 *)teddy; - - teddy->size = size; - teddy->engineID = eng.getID(); - teddy->maxStringLen = verify_u32(maxLen(lits)); - - u8 *ptr = teddy_base + sizeof(Teddy) + maskLen; - memcpy(ptr, confirmTmp.get(), confirmTmp.size()); - ptr += confirmTmp.size(); - - teddy->floodOffset = verify_u32(ptr - teddy_base); - memcpy(ptr, floodControlTmp.get(), floodControlTmp.size()); - ptr += floodControlTmp.size(); - - u8 *baseMsk = teddy_base + sizeof(Teddy); +static +void fillNibbleMasks(const map> &bucketToLits, + const vector &lits, + u32 numMasks, u32 maskWidth, size_t maskLen, + u8 *baseMsk) { + memset(baseMsk, 0xff, maskLen); for (const auto &b2l : bucketToLits) { const u32 &bucket_id = b2l.first; @@ -354,16 +372,18 @@ bytecode_ptr TeddyCompiler::build() { const u32 sz = verify_u32(l.s.size()); // fill in masks - for (u32 j = 0; j < eng.numMasks; j++) { - u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8); - u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8); + for (u32 j = 0; j < numMasks; j++) { + const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8); + const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8); + const u32 lo_base = msk_id_lo * 16; + const u32 hi_base = msk_id_hi * 16; // if we don't have a char at this position, fill in i // locations in these masks with '1' if (j >= sz) { for (u32 n = 0; n < 16; n++) { - baseMsk[msk_id_lo * 16 + n] |= bmsk; - baseMsk[msk_id_hi * 16 + n] |= bmsk; + baseMsk[lo_base + n] &= ~bmsk; + baseMsk[hi_base + n] &= ~bmsk; } } else { u8 c = l.s[sz - 1 - j]; @@ -382,51 +402,173 @@ bytecode_ptr TeddyCompiler::build() { for (u8 cm = 0; cm < 0x10; cm++) { if ((cm & m_lo) == (cmp_lo & m_lo)) { - baseMsk[msk_id_lo * 16 + cm] |= bmsk; + baseMsk[lo_base + 
cm] &= ~bmsk; } if ((cm & m_hi) == (cmp_hi & m_hi)) { - baseMsk[msk_id_hi * 16 + cm] |= bmsk; + baseMsk[hi_base + cm] &= ~bmsk; } } - } else{ + } else { if (l.nocase && ourisalpha(c)) { u32 cmHalfClear = (0xdf >> hiShift) & 0xf; - u32 cmHalfSet = (0x20 >> hiShift) & 0xf; - baseMsk[msk_id_hi * 16 + (n_hi & cmHalfClear)] |= bmsk; - baseMsk[msk_id_hi * 16 + (n_hi | cmHalfSet )] |= bmsk; + u32 cmHalfSet = (0x20 >> hiShift) & 0xf; + baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk; } else { - baseMsk[msk_id_hi * 16 + n_hi] |= bmsk; + baseMsk[hi_base + n_hi] &= ~bmsk; } - baseMsk[msk_id_lo * 16 + n_lo] |= bmsk; + baseMsk[lo_base + n_lo] &= ~bmsk; + } + } + } + } + } +} + +static +void fillReinforcedTable(const map> &bucketToLits, + const vector &lits, + u8 *rtable_base, const u32 num_tables) { + vector tables; + for (u32 i = 0; i < num_tables; i++) { + tables.push_back(rtable_base + i * RTABLE_SIZE); + } + + for (auto t : tables) { + initReinforcedTable(t); + } + + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; + u8 *rmsk = tables[bucket_id / 8]; + const u8 bmsk = 1U << (bucket_id % 8); + + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; + DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); + const u32 sz = verify_u32(l.s.size()); + + // fill in reinforced masks + for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) { + if (sz - 1 < j) { + fillReinforcedMsk(rmsk, ALL_CHAR_SET, j, bmsk); + } else { + u8 c = l.s[sz - 1 - j]; + if (l.nocase && ourisalpha(c)) { + u8 c_up = c & 0xdf; + fillReinforcedMsk(rmsk, c_up, j, bmsk); + u8 c_lo = c | 0x20; + fillReinforcedMsk(rmsk, c_lo, j, bmsk); + } else { + fillReinforcedMsk(rmsk, c, j, bmsk); } } } } } + for (auto t : tables) { + fillReinforcedMskZero(t); + } +} + +bytecode_ptr TeddyCompiler::build() { + u32 maskWidth = eng.getNumBuckets() / 8; + + size_t headerSize = sizeof(Teddy); + size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; + size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth; + + auto floodTable = setupFDRFloodControl(lits, eng, grey); + auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); + + // Note: we place each major structure here on a cacheline boundary. + size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + + ROUNDUP_CL(reinforcedMaskLen) + + ROUNDUP_CL(confirmTable.size()) + floodTable.size(); + + auto fdr = make_zeroed_bytecode_ptr(size, 64); + assert(fdr); // otherwise would have thrown std::bad_alloc + Teddy *teddy = (Teddy *)fdr.get(); // ugly + u8 *teddy_base = (u8 *)teddy; + + // Write header. + teddy->size = size; + teddy->engineID = eng.getID(); + teddy->maxStringLen = verify_u32(maxLen(lits)); + teddy->numStrings = verify_u32(lits.size()); + + // Write confirm structures. + u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + + ROUNDUP_CL(reinforcedMaskLen); + assert(ISALIGNED_CL(ptr)); + teddy->confOffset = verify_u32(ptr - teddy_base); + memcpy(ptr, confirmTable.get(), confirmTable.size()); + ptr += ROUNDUP_CL(confirmTable.size()); + + // Write flood control structures. + assert(ISALIGNED_CL(ptr)); + teddy->floodOffset = verify_u32(ptr - teddy_base); + memcpy(ptr, floodTable.get(), floodTable.size()); + ptr += floodTable.size(); + + // Write teddy masks. + u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); + fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen, + baseMsk); + + // Write reinforcement masks. 
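(Editorial aside, not part of the patch: the fill code above inverts Teddy's old encoding. The nibble tables, and the reinforcement table written just below, start out as 0xff and have bucket bits cleared for compatible values, so at scan time the lookups can simply be OR'd together and a zero bit read as "this byte may still belong to this bucket". A rough scalar model of that convention follows, keeping only the distance-1 slice of the reinforcement table; all names here are illustrative.)

/* Scalar model of the inverted Teddy encoding: 0xff rejects by default,
 * a cleared bucket bit marks a compatible value. */
#include <string.h>

struct bucket_model {
    unsigned char lo[16];     /* indexed by a byte's low nibble        */
    unsigned char hi[16];     /* indexed by a byte's high nibble       */
    unsigned char reinf[256]; /* indexed by the preceding byte's value */
};

static void model_init(struct bucket_model *m) {
    memset(m, 0xff, sizeof(*m));              /* reject everything to start */
}

static void model_add_char(struct bucket_model *m, unsigned bucket,
                           unsigned char c) {
    unsigned char bmsk = (unsigned char)(1u << (bucket % 8));
    m->lo[c & 0xf] &= (unsigned char)~bmsk;   /* low nibble is compatible  */
    m->hi[c >> 4]  &= (unsigned char)~bmsk;   /* high nibble is compatible */
}

static void model_reinforce(struct bucket_model *m, unsigned bucket,
                            unsigned char prev) {
    unsigned char bmsk = (unsigned char)(1u << (bucket % 8));
    m->reinf[prev] &= (unsigned char)~bmsk;   /* prev may precede a match */
}

/* A byte stays a candidate for "bucket" iff its bit is 0 after the OR. */
static int model_candidate(const struct bucket_model *m, unsigned bucket,
                           unsigned char prev, unsigned char c) {
    unsigned bits = m->lo[c & 0xf] | m->hi[c >> 4] | m->reinf[prev];
    return (bits & (1u << (bucket % 8))) == 0;
}

(The real tables are 16-byte shuffle masks consumed with PSHUFB, and each reinforcement entry covers REINFORCED_MSK_LEN positions rather than one; the model only shows what the cleared bits mean.)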
+ u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); + fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth); + + return fdr; +} + + +static +bool assignStringsToBuckets( + const vector &lits, + TeddyEngineDescription &eng, + map> &bucketToLits) { + assert(eng.numMasks <= MAX_NUM_MASKS); + if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { + DEBUG_PRINTF("too many literals: %zu\n", lits.size()); + return false; + } #ifdef TEDDY_DEBUG - for (u32 i = 0; i < eng.numMasks * 2; i++) { - for (u32 j = 0; j < 16; j++) { - u8 val = baseMsk[i * 16 + j]; - for (u32 k = 0; k < 8; k++) { - printf("%s", ((val >> k) & 0x1) ? "1" : "0"); - } - printf(" "); + for (size_t i = 0; i < lits.size(); i++) { + printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(), + lits[i].nocase ? "caseless" : "caseful"); + for (size_t j = 0; j < lits[i].s.size(); j++) { + printf("%02x", ((u32)lits[i].s[j])&0xff); } printf("\n"); } #endif - return fdr; + if (!pack(lits, eng, bucketToLits)) { + DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n", + lits.size(), eng.getNumBuckets()); + return false; + } + return true; } } // namespace -bytecode_ptr teddyBuildTableHinted(const vector &lits, - bool make_small, u32 hint, - const target_t &target, - const Grey &grey) { +bytecode_ptr teddyBuildTable(const HWLMProto &proto, const Grey &grey) { + TeddyCompiler tc(proto.lits, proto.bucketToLits, *(proto.teddyEng), + proto.make_small, grey); + return tc.build(); +} + + +unique_ptr teddyBuildProtoHinted( + u8 engType, const vector &lits, + bool make_small, u32 hint, const target_t &target) { unique_ptr des; if (hint == HINT_INVALID) { des = chooseTeddyEngine(target, lits); @@ -436,8 +578,14 @@ bytecode_ptr teddyBuildTableHinted(const vector &lits, if (!des) { return nullptr; } - TeddyCompiler tc(lits, *des, make_small, grey); - return tc.build(); + + map> bucketToLits; + if (!assignStringsToBuckets(lits, *des, bucketToLits)) { + return nullptr; + } + + return ue2::make_unique(engType, move(des), lits, + bucketToLits, make_small); } } // namespace ue2 diff --git a/src/fdr/teddy_compile.h b/src/fdr/teddy_compile.h index 5ff4d839..a2b4a13c 100644 --- a/src/fdr/teddy_compile.h +++ b/src/fdr/teddy_compile.h @@ -35,6 +35,7 @@ #define TEDDY_COMPILE_H #include "ue2common.h" +#include "hwlm/hwlm_build.h" #include "util/bytecode_ptr.h" #include @@ -43,15 +44,16 @@ struct FDR; namespace ue2 { +class TeddyEngineDescription; struct Grey; struct hwlmLiteral; struct target_t; -bytecode_ptr teddyBuildTableHinted(const std::vector &lits, - bool make_small, u32 hint, - const target_t &target, - const Grey &grey); +bytecode_ptr teddyBuildTable(const HWLMProto &proto, const Grey &grey); +std::unique_ptr teddyBuildProtoHinted( + u8 engType, const std::vector &lits, + bool make_small, u32 hint, const target_t &target); } // namespace ue2 #endif // TEDDY_COMPILE_H diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp index f7559b13..88ae0f53 100644 --- a/src/fdr/teddy_engine_description.cpp +++ b/src/fdr/teddy_engine_description.cpp @@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const { return numMasks; } -bool TeddyEngineDescription::needConfirm(const vector &lits) const { - if (packed || lits.size() > getNumBuckets()) { - return true; - } - for (const auto &lit : lits) { - if (lit.s.size() > numMasks || !lit.msk.empty()) { - return true; - } - } - return false; -} - void getTeddyDescriptions(vector *out) { static const TeddyEngineDef defns[] = { { 3, 0 | 
HS_CPU_FEATURES_AVX2, 1, 16, false }, diff --git a/src/fdr/teddy_engine_description.h b/src/fdr/teddy_engine_description.h index 3979a5d3..95931613 100644 --- a/src/fdr/teddy_engine_description.h +++ b/src/fdr/teddy_engine_description.h @@ -55,7 +55,6 @@ public: explicit TeddyEngineDescription(const TeddyEngineDef &def); u32 getDefaultFloodSuffixLength() const override; - bool needConfirm(const std::vector &lits) const; }; std::unique_ptr diff --git a/src/fdr/teddy_internal.h b/src/fdr/teddy_internal.h index bbd8e788..1e9e603f 100644 --- a/src/fdr/teddy_internal.h +++ b/src/fdr/teddy_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,6 +26,28 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/* Teddy bytecode layout: + * * |-----| + * * | | struct Teddy + * * |-----| + * * | | teddy masks + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 0..7 + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 8..15 (FAT teddy) + * * | | + * * |-----| + * * | | confirm + * * | | + * * | | + * * |-----| + * * | | flood control + * * | | + * * |-----| + */ + #ifndef TEDDY_INTERNAL_H #define TEDDY_INTERNAL_H @@ -36,11 +58,9 @@ struct Teddy { u32 engineID; u32 size; u32 maxStringLen; + u32 numStrings; + u32 confOffset; u32 floodOffset; - u32 link; - u32 pad1; - u32 pad2; - u32 pad3; }; #endif diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index c5f0885f..1dbeb097 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -38,8 +38,12 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" +#include "util/uniform_ops.h" extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; +#if defined(HAVE_AVX2) +extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; +#endif #ifdef ARCH_64_BIT #define TEDDY_CONF_TYPE u64a @@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { } // Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load128(p_mask_arr[n] + 16 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,16) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=16) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... 
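(Editorial aside, not part of the patch: a scalar illustration of the pattern that load128(p_mask_arr[n] + 16 - m) yields, as described in the comment above: m poison bytes, then n valid bytes, then poison out to the end of the vector. The helper name is made up.)

#include <assert.h>
#include <string.h>

/* Build the 16-byte poison mask described above: bytes outside the valid
 * window [m, m + n) are 0xff ("poison"), bytes inside it are 0x00. */
static void poison_mask_16(unsigned char out[16], unsigned m, unsigned n) {
    assert(m + n <= 16);
    memset(out, 0xff, 16);       /* poison everything by default */
    memset(out + m, 0x00, n);    /* then open the valid window   */
}

(In the in-buffer case of vectoredLoad128() below, m works out to start_offset - start and n to the number of available bytes at or after the start offset, so bytes before the requested start and past the end of the buffer are poisoned.)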
static really_inline -m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, +m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, const u32 nMasks) { union { @@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, uintptr_t copy_start; uintptr_t copy_len; - if (ptr >= lo) { + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); uintptr_t avail = (uintptr_t)(hi - ptr); if (avail >= 16) { - *p_mask = load128(p_mask_arr[16] + 16); + assert(start_offset - start <= 16); + *p_mask = loadu128(p_mask_arr[16 - start_offset + start] + + 16 - start_offset + start); return loadu128(ptr); } - *p_mask = load128(p_mask_arr[avail] + 16); + assert(start_offset - start <= avail); + *p_mask = loadu128(p_mask_arr[avail - start_offset + start] + + 16 - start_offset + start); copy_start = 0; copy_len = avail; - } else { + } else { // start zone uintptr_t need = MIN((uintptr_t)(lo - ptr), MIN(len_history, nMasks - 1)); uintptr_t start = (uintptr_t)(lo - ptr); uintptr_t i; - for (i = start - need; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; } uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); - *p_mask = loadu128(p_mask_arr[end - start] + 16 - start); - copy_start = i; - copy_len = end - i; + assert(start + start_offset <= end); + *p_mask = loadu128(p_mask_arr[end - start - start_offset] + + 16 - start - start_offset); + copy_start = start; + copy_len = end - start; } // Runt block from the buffer. @@ -152,6 +182,205 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, return u.val128; } +#if defined(HAVE_AVX2) +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. + */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. 
+// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,32) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=32) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[32]; + m256 val256; + } u; + u.val256 = zeroes256(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + assert(start_offset - start <= 32); + *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] + + 32 - start_offset + start); + return loadu256(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] + + 32 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu256(p_mask_arr256[end - start - start_offset] + + 32 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +#endif // HAVE_AVX2 + +#if defined(HAVE_AVX512) +// Note: p_mask is an output param that initialises a poison mask. +// u64a k = ones_u64a << n' >> m'; // m' < n' +// *p_mask = set_mask_m512(~k); +// means p_mask is consist of: +// (n' - m') poison bytes "0xff" at the beginning, +// followed by (64 - n') valid bytes "0x00", +// then followed by the rest m' poison bytes "0xff". +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,64) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=64) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... 
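(Editorial aside, not part of the patch: the AVX-512 path below replaces the in-memory p_mask_arr tables with 64-bit lane masks built from shifts, one bit per byte of the 64-byte vector. Both the poison mask k and the load masks j are contiguous runs of set bits; the idiom is sketched here with an illustrative helper name.)

#include <assert.h>
#include <stdint.h>

/* Bits [start, start + len) set, everything else clear. len must be at
 * least 1, since a shift by 64 would be undefined behaviour. */
static uint64_t byte_lane_run(unsigned start, unsigned len) {
    assert(len >= 1 && len <= 64 && start <= 64u - len);
    uint64_t run = ~UINT64_C(0) >> (64 - len);   /* len low bits set   */
    return run << start;                         /* shifted into place */
}

(set_mask_m512(~k) then turns the lanes outside such a run into 0xff poison bytes, mirroring what the p_mask_arr tables do for the 128-bit and 256-bit paths.)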
+static really_inline +m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, + const u32 nMasks) { + m512 val; + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 64) { + assert(start_offset - start <= 64); + u64a k = ones_u64a << (start_offset - start); + *p_mask = set_mask_m512(~k); + return loadu512(ptr); + } + assert(start_offset - start <= avail); + u64a k = ones_u64a << (64 - avail + start_offset - start) + >> (64 - avail); + *p_mask = set_mask_m512(~k); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(hlen, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need); + val = loadu_maskz_m512(j, &hbuf[hlen - start]); + uintptr_t end = MIN(64, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end); + *p_mask = set_mask_m512(~k); + copy_start = start; + copy_len = end - start; + } + + assert(copy_len < 64); + assert(copy_len > 0); + u64a j = ones_u64a >> (64 - copy_len) << copy_start; + val = loadu_mask_m512(val, j, ptr); + + return val; +} +#endif // HAVE_AVX512 + static really_inline u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, CautionReason reason) { @@ -190,63 +419,27 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, if (!(fdrc->groups & *control)) { continue; } + u64a tmp = 0; u64a confVal = getConfVal(a, ptr, byte, reason); confWithBit(fdrc, a, ptr - a->buf + byte, control, - last_match, confVal); + last_match, confVal, &tmp, 0); } while (unlikely(*conf)); } static really_inline -void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, - const u32 *confBase, CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - do { - u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); - u32 byte = bit / bucket + offset; - u32 idx = bit % bucket; - u32 cf = confBase[idx]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, - confVal); - } while (unlikely(*conf)); +const m128 *getMaskBase(const struct Teddy *teddy) { + return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } static really_inline -void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, - const u32 *confBase, CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - do { - u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); - u32 byte = bit / bucket + offset; - u32 idx = bit % bucket; - u32 cf = confBase[idx]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control, - last_match, confVal); - } while (unlikely(*conf)); +const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m128))); } 
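(Editorial aside, not part of the patch: the getters above and below mirror the cacheline-aligned layout that TeddyCompiler::build() now emits and that teddy_internal.h diagrams: header, nibble masks, reinforcement tables, then confirm and flood control, whose offsets are stored in the header rather than recomputed. A toy version of the offset arithmetic, with made-up sizes, just to show the rounding:)

#include <assert.h>
#include <stddef.h>

#define CACHELINE 64
#define ROUND_UP_CL(x) ((((x) + CACHELINE - 1) / CACHELINE) * CACHELINE)

static void teddy_layout_example(void) {
    size_t header  = 24;              /* pretend sizeof(struct Teddy)     */
    size_t maskLen = 3 * 16 * 2 * 1;  /* e.g. numMasks = 3, maskWidth = 1 */

    size_t masks      = ROUND_UP_CL(header);           /* nibble masks    */
    size_t reinforced = masks + ROUND_UP_CL(maskLen);  /* reinforcement   */

    /* Confirm and flood control follow; build() records their offsets in
     * the header (confOffset, floodOffset), so the runtime getters never
     * need to re-derive the intermediate sizes. */
    assert(masks % CACHELINE == 0 && reinforced % CACHELINE == 0);
}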
static really_inline -const m128 * getMaskBase(const struct Teddy *teddy) { - return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy)); -} - -static really_inline -const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) { - return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + - (numMask*32)); +const u32 *getConfBase(const struct Teddy *teddy) { + return (const u32 *)((const u8 *)teddy + teddy->confOffset); } #endif /* TEDDY_RUNTIME_COMMON_H_ */ diff --git a/src/grey.cpp b/src/grey.cpp index 24140c05..3762a497 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -139,6 +139,7 @@ Grey::Grey(void) : limitSmallWriteOutfixSize(1048576), // 1 MB smallWriteMaxPatterns(10000), smallWriteMaxLiterals(10000), + smallWriteMergeBatchSize(20), allowTamarama(true), // Tamarama engine tamaChunkSize(100), dumpFlags(0), @@ -302,6 +303,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(limitSmallWriteOutfixSize); G_UPDATE(smallWriteMaxPatterns); G_UPDATE(smallWriteMaxLiterals); + G_UPDATE(smallWriteMergeBatchSize); G_UPDATE(allowTamarama); G_UPDATE(tamaChunkSize); G_UPDATE(limitPatternCount); diff --git a/src/grey.h b/src/grey.h index 50519418..34c62918 100644 --- a/src/grey.h +++ b/src/grey.h @@ -157,6 +157,7 @@ struct Grey { u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs u32 smallWriteMaxPatterns; // only try small writes if fewer patterns u32 smallWriteMaxLiterals; // only try small writes if fewer literals + u32 smallWriteMergeBatchSize; // number of DFAs to merge in a batch // Tamarama engine bool allowTamarama; diff --git a/src/hs.cpp b/src/hs.cpp index e3c1f811..c2143fe3 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -227,10 +227,10 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, target_t target_info = platform ? target_t(*platform) : get_current_target(); - CompileContext cc(isStreaming, isVectored, target_info, g); - NG ng(cc, elements, somPrecision); - try { + CompileContext cc(isStreaming, isVectored, target_info, g); + NG ng(cc, elements, somPrecision); + for (unsigned int i = 0; i < elements; i++) { // Add this expression to the compiler try { @@ -262,7 +262,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, e.hasIndex ? (int)e.index : -1); return HS_COMPILER_ERROR; } - catch (std::bad_alloc) { + catch (const std::bad_alloc &) { *db = nullptr; *comp_error = const_cast(&hs_enomem); return HS_COMPILER_ERROR; @@ -399,7 +399,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, *error = generateCompileError(e); return HS_COMPILER_ERROR; } - catch (std::bad_alloc) { + catch (std::bad_alloc &) { *error = const_cast(&hs_enomem); return HS_COMPILER_ERROR; } diff --git a/src/hs_common.h b/src/hs_common.h index ffea397e..e1f079f2 100644 --- a/src/hs_common.h +++ b/src/hs_common.h @@ -561,6 +561,18 @@ hs_error_t HS_CDECL hs_valid_platform(void); */ #define HS_ARCH_ERROR (-11) +/** + * Provided buffer was too small. + * + * This error indicates that there was insufficient space in the buffer. The + * call should be repeated with a larger provided buffer. + * + * Note: in this situation, it is normal for the amount of space required to be + * returned in the same manner as the used space would have been returned if the + * call was successful. 
+ */ +#define HS_INSUFFICIENT_SPACE (-12) + /** @} */ #ifdef __cplusplus diff --git a/src/hs_runtime.h b/src/hs_runtime.h index ecd97ca5..98e50068 100644 --- a/src/hs_runtime.h +++ b/src/hs_runtime.h @@ -321,6 +321,120 @@ hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, match_event_handler onEvent, void *context); +/** + * Creates a compressed representation of the provided stream in the buffer + * provided. This compressed representation can be converted back into a stream + * state by using @ref hs_expand_stream() or @ref hs_reset_and_expand_stream(). + * The size of the compressed representation will be placed into @a used_space. + * + * If there is not sufficient space in the buffer to hold the compressed + * represention, @ref HS_INSUFFICIENT_SPACE will be returned and @a used_space + * will be populated with the amount of space required. + * + * Note: this function does not close the provided stream, you may continue to + * use the stream or to free it with @ref hs_close_stream(). + * + * @param stream + * The stream (as created by @ref hs_open_stream()) to be compressed. + * + * @param buf + * Buffer to write the compressed representation into. Note: if the call is + * just being used to determine the amount of space required, it is allowed + * to pass NULL here and @a buf_space as 0. + * + * @param buf_space + * The number of bytes in @a buf. If buf_space is too small, the call will + * fail with @ref HS_INSUFFICIENT_SPACE. + * + * @param used_space + * Pointer to where the amount of used space will be written to. The used + * buffer space is always less than or equal to @a buf_space. If the call + * fails with @ref HS_INSUFFICIENT_SPACE, this pointer will be used to + * write out the amount of buffer space required. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_INSUFFICIENT_SPACE if the provided + * buffer is too small. + */ +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * into a new stream. + * + * Note: @a buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @a db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param db + * The compiled pattern database that the compressed stream was opened + * against. + * + * @param stream + * On success, a pointer to the expanded @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, const char *buf, + size_t buf_size); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * on top of the 'to' stream. The 'to' stream will first be reset (reporting + * any EOD matches if a non-NULL @a onEvent callback handler is provided). + * + * Note: the 'to' stream must be opened against the same database as the + * compressed stream. 
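(Editorial aside, not part of the patch: taken together, hs_compress_stream(), hs_expand_stream() and hs_reset_and_expand_stream() support a suspend/resume pattern for stream state. A sketch of the intended flow, including the size-query idiom of passing a NULL buffer with size 0 first; error handling is abbreviated, the helper names are made up, and db is assumed to be a streaming-mode database compiled elsewhere.)

#include <stdlib.h>
#include <hs.h>

/* Suspend: serialise the stream's state into a caller-owned buffer. */
static char *suspend_stream(const hs_stream_t *stream, size_t *used) {
    size_t needed = 0;
    /* Size query: a NULL buffer of size 0 reports the space required. */
    if (hs_compress_stream(stream, NULL, 0, &needed) != HS_INSUFFICIENT_SPACE) {
        return NULL;
    }
    char *buf = malloc(needed);
    if (!buf || hs_compress_stream(stream, buf, needed, used) != HS_SUCCESS) {
        free(buf);
        return NULL;
    }
    return buf;    /* the original stream stays open and usable */
}

/* Resume: rebuild a live stream from the compressed representation. */
static hs_stream_t *resume_stream(const hs_database_t *db, const char *buf,
                                  size_t size) {
    hs_stream_t *stream = NULL;
    if (hs_expand_stream(db, &stream, buf, size) != HS_SUCCESS) {
        return NULL;
    }
    return stream;
}

(hs_reset_and_expand_stream() does the same as the second helper but on top of an already open stream, resetting it first and reporting its EOD matches if a callback and scratch are supplied.)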
+ * + * Note: @a buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @a db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param to_stream + * A pointer to the generated @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @a onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream, + const char *buf, size_t buf_size, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context); + /** * The block (non-streaming) regular expression scanner. * diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 6eaa7ed1..8cf585a9 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -170,7 +170,7 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen, } hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, void *ctxt, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, hwlm_group_t groups) { assert(t); @@ -184,25 +184,23 @@ hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, if (t->type == HWLM_ENGINE_NOOD) { DEBUG_PRINTF("calling noodExec\n"); - return noodExec(HWLM_C_DATA(t), buf + start, len - start, start, cb, - ctxt); - } else { - assert(t->type == HWLM_ENGINE_FDR); - const union AccelAux *aa = &t->accel0; - if ((groups & ~t->accel1_groups) == 0) { - DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); - aa = &t->accel1; - } - do_accel_block(aa, buf, len, &start); - DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, - start); - return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, ctxt, groups); + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_block(aa, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, scratch, groups); } -hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch, - size_t len, size_t start, HWLMCallback cb, - void *ctxt, hwlm_group_t groups) { +hwlm_error_t hwlmExecStreaming(const struct HWLM *t, size_t len, size_t start, + HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups) { assert(t); assert(scratch); @@ -224,24 +222,21 @@ hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch, // If we've been handed a start offset, we can use a block mode scan at // that offset. 
if (start) { - return noodExec(HWLM_C_DATA(t), buf + start, len - start, start, - cb, ctxt); + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); } else { return noodExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, cb, - ctxt, scratch->fdr_temp_buf, - FDR_TEMP_BUF_SIZE); + scratch); } - } else { - // t->type == HWLM_ENGINE_FDR - const union AccelAux *aa = &t->accel0; - if ((groups & ~t->accel1_groups) == 0) { - DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); - aa = &t->accel1; - } - do_accel_streaming(aa, hbuf, hlen, buf, len, &start); - DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, - start); - return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, - start, cb, ctxt, groups); } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_streaming(aa, hbuf, hlen, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, start, cb, + scratch, groups); } diff --git a/src/hwlm/hwlm.h b/src/hwlm/hwlm.h index a17575df..224ecf6b 100644 --- a/src/hwlm/hwlm.h +++ b/src/hwlm/hwlm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,14 +71,17 @@ typedef hwlm_group_t hwlmcb_rv_t; * designed for a different architecture). */ #define HWLM_ERROR_UNKNOWN 2 +/** \brief Max length of the literal passed to HWLM. */ +#define HWLM_LITERAL_MAX_LEN 8 + struct hs_scratch; struct HWLM; /** \brief The type for an HWLM callback. * - * This callback receives a start-of-match offset, an end-of-match offset, the - * ID of the match and the context pointer that was passed into \ref - * hwlmExec or \ref hwlmExecStreaming. + * This callback receives an end-of-match offset, the ID of the match and + * the context pointer that was passed into \ref hwlmExec or + * \ref hwlmExecStreaming. * * A callback return of \ref HWLM_TERMINATE_MATCHING will stop matching. * @@ -92,8 +95,8 @@ struct HWLM; * belonging to the literal which was active at the when the end match location * was first reached. */ -typedef hwlmcb_rv_t (*HWLMCallback)(size_t start, size_t end, u32 id, - void *context); +typedef hwlmcb_rv_t (*HWLMCallback)(size_t end, u32 id, + struct hs_scratch *scratch); /** \brief Match strings in table. * @@ -104,24 +107,26 @@ typedef hwlmcb_rv_t (*HWLMCallback)(size_t start, size_t end, u32 id, * Returns \ref HWLM_TERMINATED if scanning is cancelled due to the callback * returning \ref HWLM_TERMINATE_MATCHING. * - * \p start is the first offset at which a match may start. + * \p start is the first offset at which a match may start. Note: match + * starts may include masks overhanging the main literal. * * The underlying engine may choose not to report any match which starts before * the first possible match of a literal which is in the initial group mask. */ hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len, - size_t start, HWLMCallback callback, void *context, - hwlm_group_t groups); + size_t start, HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); /** \brief As for \ref hwlmExec, but a streaming case across two buffers. 
- * - * \p scratch is used to access fdr_temp_buf and to access the history buffer, - * history length and the main buffer. * * \p len is the length of the main buffer to be scanned. * * \p start is an advisory hint representing the first offset at which a match - * may start. Some underlying literal matches may not respect it. + * may start. Some underlying literal matches may not respect it. Note: match + * starts may include masks overhanging the main literal. + * + * \p scratch is used to access the history buffer, history length and + * the main buffer. * * Two buffers/lengths are provided. Matches that occur entirely within * the history buffer will not be reported by this function. The offsets @@ -129,10 +134,9 @@ hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len, * match at byte 10 of the main buffer is reported as 10). Matches that start * in the history buffer will have starts reported with 'negative' values. */ -hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, - struct hs_scratch *scratch, size_t len, - size_t start, HWLMCallback callback, - void *context, hwlm_group_t groups); +hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, size_t len, size_t start, + HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index 2f61ea6d..1b332815 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -41,8 +41,12 @@ #include "scratch.h" #include "ue2common.h" #include "fdr/fdr_compile.h" +#include "fdr/fdr_compile_internal.h" +#include "fdr/fdr_engine_description.h" +#include "fdr/teddy_engine_description.h" #include "util/compile_context.h" #include "util/compile_error.h" +#include "util/make_unique.h" #include "util/ue2string.h" #include @@ -53,6 +57,28 @@ using namespace std; namespace ue2 { +HWLMProto::HWLMProto(u8 engType_in, vector lits_in) + : engType(engType_in), lits(move(lits_in)) {} + +HWLMProto::HWLMProto(u8 engType_in, + unique_ptr eng_in, + vector lits_in, + map> bucketToLits_in, + bool make_small_in) + : engType(engType_in), fdrEng(move(eng_in)), lits(move(lits_in)), + bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} + +HWLMProto::HWLMProto(u8 engType_in, + unique_ptr eng_in, + vector lits_in, + map> bucketToLits_in, + bool make_small_in) + : engType(engType_in), teddyEng(move(eng_in)), + lits(move(lits_in)), + bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} + +HWLMProto::~HWLMProto() {} + static void dumpLits(UNUSED const vector &lits) { #ifdef DEBUG @@ -89,17 +115,55 @@ bool isNoodleable(const vector &lits, return false; } - if (!lits.front().msk.empty()) { - DEBUG_PRINTF("noodle can't handle supplementary masks\n"); - return false; - } - return true; } -bytecode_ptr hwlmBuild(const vector &lits, bool make_small, - const CompileContext &cc, +bytecode_ptr hwlmBuild(const HWLMProto &proto, const CompileContext &cc, UNUSED hwlm_group_t expected_groups) { + size_t engSize = 0; + shared_ptr eng; + + const auto &lits = proto.lits; + DEBUG_PRINTF("building table with %zu strings\n", lits.size()); + + if (proto.engType == HWLM_ENGINE_NOOD) { + DEBUG_PRINTF("build noodle table\n"); + const hwlmLiteral &lit = lits.front(); + auto noodle = noodBuildTable(lit); + if (noodle) { + engSize = noodle.size(); + } + eng = move(noodle); + } else { + DEBUG_PRINTF("building a new deal\n"); + auto fdr = fdrBuildTable(proto, cc.grey); + if (fdr) { + engSize = fdr.size(); + } + eng = move(fdr); + } 
+ + if (!eng) { + return nullptr; + } + + assert(engSize); + if (engSize > cc.grey.limitLiteralMatcherSize) { + throw ResourceLimitError(); + } + + const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize; + auto h = make_zeroed_bytecode_ptr(hwlm_len, 64); + + h->type = proto.engType; + memcpy(HWLM_DATA(h.get()), eng.get(), engSize); + + return h; +} + +unique_ptr +hwlmBuildProto(vector &lits, bool make_small, + const CompileContext &cc) { assert(!lits.empty()); dumpLits(lits); @@ -129,9 +193,7 @@ bytecode_ptr hwlmBuild(const vector &lits, bool make_small, } } - u8 engType = 0; - size_t engSize = 0; - shared_ptr eng; + unique_ptr proto; DEBUG_PRINTF("building table with %zu strings\n", lits.size()); @@ -139,39 +201,17 @@ bytecode_ptr hwlmBuild(const vector &lits, bool make_small, if (isNoodleable(lits, cc)) { DEBUG_PRINTF("build noodle table\n"); - engType = HWLM_ENGINE_NOOD; - const hwlmLiteral &lit = lits.front(); - auto noodle = noodBuildTable(lit); - if (noodle) { - engSize = noodle.size(); - } - eng = move(noodle); + proto = ue2::make_unique(HWLM_ENGINE_NOOD, lits); } else { DEBUG_PRINTF("building a new deal\n"); - engType = HWLM_ENGINE_FDR; - auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey); - if (fdr) { - engSize = fdr.size(); + proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small, + cc.target_info, cc.grey); + if (!proto) { + return nullptr; } - eng = move(fdr); } - if (!eng) { - return nullptr; - } - - assert(engSize); - if (engSize > cc.grey.limitLiteralMatcherSize) { - throw ResourceLimitError(); - } - - const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize; - auto h = make_zeroed_bytecode_ptr(hwlm_len, 64); - - h->type = engType; - memcpy(HWLM_DATA(h.get()), eng.get(), engSize); - - return h; + return proto; } size_t hwlmSize(const HWLM *h) { diff --git a/src/hwlm/hwlm_build.h b/src/hwlm/hwlm_build.h index f2691496..91f227dc 100644 --- a/src/hwlm/hwlm_build.h +++ b/src/hwlm/hwlm_build.h @@ -34,9 +34,11 @@ #define HWLM_BUILD_H #include "hwlm.h" +#include "hwlm_literal.h" #include "ue2common.h" #include "util/bytecode_ptr.h" +#include #include #include @@ -44,15 +46,62 @@ struct HWLM; namespace ue2 { +class FDREngineDescription; +class TeddyEngineDescription; struct CompileContext; struct Grey; -struct hwlmLiteral; + +/** \brief Class representing a literal matcher prototype. */ +struct HWLMProto { + /** + * \brief Engine type to distinguish noodle from FDR and Teddy. + */ + u8 engType; + + /** + * \brief FDR engine description. + */ + std::unique_ptr fdrEng; + + /** + * \brief Teddy engine description. + */ + std::unique_ptr teddyEng; + + /** + * \brief HWLM literals passed from Rose. + */ + std::vector lits; + + /** + * \brief Bucket assignment info in FDR and Teddy + */ + std::map> bucketToLits; + + /** + * \brief Flag to optimise matcher for small size from Rose. + */ + bool make_small = false; + + HWLMProto(u8 engType_in, std::vector lits_in); + + HWLMProto(u8 engType_in, std::unique_ptr eng_in, + std::vector lits_in, + std::map> bucketToLits_in, + bool make_small_in); + + HWLMProto(u8 engType_in, std::unique_ptr eng_in, + std::vector lits_in, + std::map> bucketToLits_in, + bool make_small_in); + + ~HWLMProto(); +}; /** \brief Build an \ref HWLM literal matcher runtime structure for a group of * literals. * - * \param lits The group of literals. - * \param make_small Optimise matcher for small size. + * \param proto Literal matcher prototype. * \param cc Compile context. * \param expected_groups FIXME: document me! 
* @@ -60,10 +109,13 @@ struct hwlmLiteral; * may result in a nullptr return value, or a std::bad_alloc exception being * thrown. */ -bytecode_ptr hwlmBuild(const std::vector &lits, - bool make_small, const CompileContext &cc, +bytecode_ptr hwlmBuild(const HWLMProto &proto, const CompileContext &cc, hwlm_group_t expected_groups = HWLM_ALL_GROUPS); +std::unique_ptr +hwlmBuildProto(std::vector &lits, bool make_small, + const CompileContext &cc); + /** * Returns an estimate of the number of repeated characters on the end of a * literal that will make a literal set of size \a numLiterals suffer diff --git a/src/hwlm/hwlm_dump.cpp b/src/hwlm/hwlm_dump.cpp index 58411ab2..59353eee 100644 --- a/src/hwlm/hwlm_dump.cpp +++ b/src/hwlm/hwlm_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,16 +38,19 @@ #include "ue2common.h" #include "fdr/fdr_dump.h" #include "nfa/accel_dump.h" - -#include +#include "util/dump_util.h" #ifndef DUMP_SUPPORT #error No dump support! #endif +using namespace std; + namespace ue2 { -void hwlmPrintStats(const HWLM *h, FILE *f) { +void hwlmGenerateDumpFiles(const HWLM *h, const string &base) { + StdioFile f(base + ".txt", "w"); + switch (h->type) { case HWLM_ENGINE_NOOD: noodPrintStats((const noodTable *)HWLM_C_DATA(h), f); diff --git a/src/hwlm/hwlm_dump.h b/src/hwlm/hwlm_dump.h index e7e38353..12f61c86 100644 --- a/src/hwlm/hwlm_dump.h +++ b/src/hwlm/hwlm_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,16 +35,16 @@ #ifdef DUMP_SUPPORT -#include +#include struct HWLM; namespace ue2 { /** \brief Dump some information about the give HWLM structure. */ -void hwlmPrintStats(const HWLM *h, FILE *f); +void hwlmGenerateDumpFiles(const HWLM *h, const std::string &base); } // namespace ue2 -#endif -#endif +#endif // DUMP_SUPPORT +#endif // HWLM_DUMP_H diff --git a/src/hwlm/hwlm_literal.h b/src/hwlm/hwlm_literal.h index 0e2a1ea5..08510fb0 100644 --- a/src/hwlm/hwlm_literal.h +++ b/src/hwlm/hwlm_literal.h @@ -42,12 +42,11 @@ namespace ue2 { -/** \brief Max length of the literal passed to HWLM. */ -#define HWLM_LITERAL_MAX_LEN 8 - /** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. */ #define HWLM_MASKLEN 8 +#define INVALID_LIT_ID ~0U + /** \brief Class representing a literal, fed to \ref hwlmBuild. */ struct hwlmLiteral { std::string s; //!< \brief The literal itself. @@ -67,6 +66,21 @@ struct hwlmLiteral { * can be quashed by the literal matcher. */ bool noruns; + /** \brief included literal id. */ + u32 included_id = INVALID_LIT_ID; + + /** \brief Squash mask for FDR's confirm mask for included literals. + * + * In FDR confirm, if we have included literal in another bucket, + * we can use this mask to squash the bit for the bucket in FDR confirm + * mask and then run programs of included literal directly and avoid + * confirm work. + * + * This value is calculated in FDR compile code once bucket assignment is + * completed + */ + u8 squash = 0; + /** \brief Set of groups that literal belongs to. 
* * Use \ref HWLM_ALL_GROUPS for a literal that could match regardless of diff --git a/src/hwlm/noodle_build.cpp b/src/hwlm/noodle_build.cpp index 63fdf072..a0128d0a 100644 --- a/src/hwlm/noodle_build.cpp +++ b/src/hwlm/noodle_build.cpp @@ -35,14 +35,33 @@ #include "hwlm_literal.h" #include "noodle_internal.h" +#include "util/bitutils.h" #include "util/compare.h" #include "util/verify_types.h" #include "ue2common.h" #include // for memcpy +#include + +using std::vector; namespace ue2 { +static +u64a make_u64a_mask(const vector &v) { + assert(v.size() <= sizeof(u64a)); + if (v.size() > sizeof(u64a)) { + throw std::exception(); + } + + u64a mask = 0; + size_t len = v.size(); + unsigned char *m = (unsigned char *)&mask; + DEBUG_PRINTF("making mask len %zu\n", len); + memcpy(m, &v[0], len); + return mask; +} + static size_t findNoodFragOffset(const hwlmLiteral &lit) { const auto &s = lit.s; @@ -67,30 +86,59 @@ size_t findNoodFragOffset(const hwlmLiteral &lit) { } bytecode_ptr noodBuildTable(const hwlmLiteral &lit) { - if (!lit.msk.empty()) { - DEBUG_PRINTF("noodle can't handle supplementary masks\n"); - return nullptr; + const auto &s = lit.s; + + size_t mask_len = std::max(s.length(), lit.msk.size()); + DEBUG_PRINTF("mask is %zu bytes\n", lit.msk.size()); + assert(mask_len <= 8); + assert(lit.msk.size() == lit.cmp.size()); + + vector n_msk(mask_len); + vector n_cmp(mask_len); + + for (unsigned i = mask_len - lit.msk.size(), j = 0; i < mask_len; + i++, j++) { + DEBUG_PRINTF("m[%u] %hhx c[%u] %hhx\n", i, lit.msk[j], i, lit.cmp[j]); + n_msk[i] = lit.msk[j]; + n_cmp[i] = lit.cmp[j]; } - const auto &s = lit.s; - size_t noodle_len = sizeof(noodTable) + s.length(); - auto n = make_zeroed_bytecode_ptr(noodle_len); + size_t s_off = mask_len - s.length(); + for (unsigned i = s_off; i < mask_len; i++) { + u8 c = s[i - s_off]; + u8 si_msk = lit.nocase && ourisalpha(c) ? (u8)CASE_CLEAR : (u8)0xff; + n_msk[i] |= si_msk; + n_cmp[i] |= c & si_msk; + assert((n_cmp[i] & si_msk) == c); + DEBUG_PRINTF("m[%u] %hhx c[%u] %hhx '%c'\n", i, n_msk[i], i, n_cmp[i], + ourisprint(c) ? (char)c : '.'); + } + + auto n = make_zeroed_bytecode_ptr(sizeof(noodTable)); assert(n); + DEBUG_PRINTF("size of nood %zu\n", sizeof(noodTable)); size_t key_offset = findNoodFragOffset(lit); n->id = lit.id; - n->len = verify_u32(s.length()); - n->key_offset = verify_u32(key_offset); + n->single = s.length() == 1 ? 1 : 0; + n->key_offset = verify_u8(s.length() - key_offset); n->nocase = lit.nocase ? 
1 : 0; - memcpy(n->str, s.c_str(), s.length()); + n->key0 = s[key_offset]; + if (n->single) { + n->key1 = 0; + } else { + n->key1 = s[key_offset + 1]; + } + n->msk = make_u64a_mask(n_msk); + n->cmp = make_u64a_mask(n_cmp); + n->msk_len = mask_len; return n; } -size_t noodSize(const noodTable *n) { - assert(n); // shouldn't call with null - return sizeof(*n) + n->len; +size_t noodSize(const noodTable *) { + return sizeof(noodTable); } } // namespace ue2 @@ -102,13 +150,17 @@ namespace ue2 { void noodPrintStats(const noodTable *n, FILE *f) { fprintf(f, "Noodle table\n"); - fprintf(f, "Len: %u Key Offset: %u\n", n->len, n->key_offset); + fprintf(f, "Key Offset: %u\n", n->key_offset); + fprintf(f, "Msk: %llx Cmp: %llx MskLen %u\n", + n->msk >> 8 * (8 - n->msk_len), n->cmp >> 8 * (8 - n->msk_len), + n->msk_len); fprintf(f, "String: "); - for (u32 i = 0; i < n->len; i++) { - if (isgraph(n->str[i]) && n->str[i] != '\\') { - fprintf(f, "%c", n->str[i]); + for (u32 i = 0; i < n->msk_len; i++) { + const u8 *m = (const u8 *)&n->cmp; + if (isgraph(m[i]) && m[i] != '\\') { + fprintf(f, "%c", m[i]); } else { - fprintf(f, "\\x%02hhx", n->str[i]); + fprintf(f, "\\x%02hhx", m[i]); } } fprintf(f, "\n"); diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index 9758f42b..d4f6902a 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -32,6 +32,7 @@ #include "hwlm.h" #include "noodle_engine.h" #include "noodle_internal.h" +#include "scratch.h" #include "ue2common.h" #include "util/arch.h" #include "util/bitutils.h" @@ -39,6 +40,7 @@ #include "util/intrinsics.h" #include "util/join.h" #include "util/masked_move.h" +#include "util/partial_store.h" #include "util/simd_utils.h" #include @@ -49,7 +51,7 @@ struct cb_info { HWLMCallback cb; //!< callback function called on match u32 id; //!< ID to pass to callback on match - void *ctx; //!< caller-supplied context to pass to callback + struct hs_scratch *scratch; //!< scratch to pass to callback size_t offsetAdj; //!< used in streaming mode }; @@ -83,9 +85,8 @@ struct cb_info { while (unlikely(z)) { \ Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos; \ - DEBUG_PRINTF("match pos %zu\n", matchPos); \ - hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi, \ - matchPos); \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \ RETURN_IF_TERMINATED(rv); \ } \ } while (0) @@ -95,9 +96,8 @@ struct cb_info { while (unlikely(z)) { \ Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos - 1; \ - DEBUG_PRINTF("match pos %zu\n", matchPos); \ - hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1, \ - noCase, cbi, matchPos); \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \ RETURN_IF_TERMINATED(rv); \ } \ } while (0) @@ -111,21 +111,26 @@ u8 caseClear8(u8 x, bool noCase) { // is used only for single chars with case insensitivity used correctly, // so it can go straight to the callback if we get this far. 
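(Editorial aside, not part of the patch: noodBuildTable() above folds the literal, its case-insensitivity and any supplementary msk/cmp pair into a single 8-byte mask/compare value, and final() below verifies a candidate position with one masked 64-bit load. A standalone sketch of the idea, with illustrative names and without the right-alignment and key_offset bookkeeping of the real table:)

#include <assert.h>
#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Fold a literal of up to 8 bytes into (msk, cmp): caseless alphabetic
 * bytes compare under a 0xdf mask, other bytes under 0xff, and unused
 * bytes of the pair are don't-cares (mask 0). */
static void build_msk_cmp(const char *lit, size_t len, int nocase,
                          uint64_t *msk, uint64_t *cmp) {
    unsigned char m[8] = {0}, c[8] = {0};
    assert(len >= 1 && len <= 8);
    for (size_t i = 0; i < len; i++) {
        unsigned char ch = (unsigned char)lit[i];
        unsigned char bm = (nocase && isalpha(ch)) ? 0xdf : 0xff;
        m[i] = bm;
        c[i] = ch & bm;
    }
    memcpy(msk, m, sizeof(m));
    memcpy(cmp, c, sizeof(c));
}

/* Verify a candidate with one masked compare; assumes at least 8 readable
 * bytes at p. Both sides were built by copying byte arrays into a 64-bit
 * value, so the comparison works byte-for-byte on any endianness. */
static int masked_match(const unsigned char *p, uint64_t msk, uint64_t cmp) {
    uint64_t v;
    memcpy(&v, p, sizeof(v));
    return (v & msk) == cmp;
}

(The real code reaches the same effect with partial_load_u64a() against the n->msk and n->cmp fields prepared above.)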
static really_inline -hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, - size_t keyOffset, bool is_double, bool noCase, - const struct cb_info *cbi, size_t pos) { - pos -= keyOffset; - if (is_double) { - if (pos + keyLen > len) { - return HWLM_SUCCESS; - } - if (cmpForward(buf + pos, key, keyLen, noCase)) { // ret 1 on mismatch - return HWLM_SUCCESS; +hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len, + char single, const struct cb_info *cbi, size_t pos) { + if (single) { + if (n->msk_len == 1) { + goto match; } } - pos += cbi->offsetAdj; - DEBUG_PRINTF("match @ %zu->%zu\n", pos, (pos + keyLen - 1)); - hwlmcb_rv_t rv = cbi->cb(pos, (pos + keyLen - 1), cbi->id, cbi->ctx); + assert(len >= n->msk_len); + u64a v = + partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len); + DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp); + if ((v & n->msk) != n->cmp) { + /* mask didn't match */ + return HWLM_SUCCESS; + } + +match: + pos -= cbi->offsetAdj; + DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset); + hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch); if (rv == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATED; } @@ -147,38 +152,43 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, #endif static really_inline -hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, - bool noCase, const struct cb_info *cbi) { +hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, + const struct cb_info *cbi) { - const MASK_TYPE mask1 = getMask(key[0], noCase); + const MASK_TYPE mask1 = getMask(n->key0, noCase); const MASK_TYPE caseMask = getCaseMask(); + size_t offset = start + n->msk_len - 1; + size_t end = len; + assert(offset < end); + #if !defined(HAVE_AVX512) hwlm_error_t rv; - size_t end = len; - if (len < CHUNKSIZE) { - rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len); + if (end - offset < CHUNKSIZE) { + rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); return rv; } - if (len == CHUNKSIZE) { - rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi, - 0, len); + if (end - offset == CHUNKSIZE) { + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, end); return rv; } uintptr_t data = (uintptr_t)buf; - uintptr_t s2Start = ROUNDUP_N(data, CHUNKSIZE) - data; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; uintptr_t last = data + end; uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; - uintptr_t s3Start = len - CHUNKSIZE; + uintptr_t s3Start = end - CHUNKSIZE; - if (s2Start) { + if (offset != s2Start) { // first scan out to the fast scan starting point DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); - rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi, - 0, s2Start); + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, s2Start); RETURN_IF_TERMINATED(rv); } @@ -186,68 +196,70 @@ hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, // scan as far as we can, bounded by the last point this key can // possibly match DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End); - rv = scanSingleFast(buf, len, key, noCase, caseMask, mask1, cbi, - s2Start, s2End); + rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start, + s2End); RETURN_IF_TERMINATED(rv); } // if we are done bail out - if (s2End == end) { + if 
(s2End == len) { return HWLM_SUCCESS; } - DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, end); - rv = scanSingleUnaligned(buf, len, s3Start, key, noCase, caseMask, mask1, - cbi, s2End, end); + DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); + rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi, + s2End, len); return rv; #else // HAVE_AVX512 - return scanSingle512(buf, len, key, noCase, caseMask, mask1, cbi); + return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); #endif } static really_inline -hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, +hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, const struct cb_info *cbi) { // we stop scanning for the key-fragment when the rest of the key can't // possibly fit in the remaining buffer - size_t end = len - keyLen + keyOffset + 2; + size_t end = len - n->key_offset + 2; + + // the first place the key can match + size_t offset = start + n->msk_len - n->key_offset; const MASK_TYPE caseMask = getCaseMask(); - const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase); - const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase); + const MASK_TYPE mask1 = getMask(n->key0, noCase); + const MASK_TYPE mask2 = getMask(n->key1, noCase); #if !defined(HAVE_AVX512) hwlm_error_t rv; - if (end - keyOffset < CHUNKSIZE) { - rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, - mask1, mask2, cbi, keyOffset, end); + if (end - offset < CHUNKSIZE) { + rv = scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); return rv; } - if (end - keyOffset == CHUNKSIZE) { - rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset, - noCase, caseMask, mask1, mask2, cbi, keyOffset, - end); + if (end - offset == CHUNKSIZE) { + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, offset, end); return rv; } uintptr_t data = (uintptr_t)buf; - uintptr_t s2Start = ROUNDUP_N(data + keyOffset, CHUNKSIZE) - data; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; uintptr_t s1End = s2Start + 1; uintptr_t last = data + end; uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; uintptr_t s3Start = end - CHUNKSIZE; - uintptr_t off = keyOffset; + uintptr_t off = offset; - if (s2Start != keyOffset) { + if (s2Start != off) { // first scan out to the fast scan starting point plus one char past to // catch the key on the overlap - DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); - rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset, - noCase, caseMask, mask1, mask2, cbi, off, - s1End); + DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start); + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, off, s1End); RETURN_IF_TERMINATED(rv); } off = s1End; @@ -261,8 +273,8 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, // scan as far as we can, bounded by the last point this key can // possibly match DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start); - rv = scanDoubleFast(buf, len, key, keyLen, keyOffset, noCase, caseMask, - mask1, mask2, cbi, s2Start, s2End); + rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + s2Start, s2End); RETURN_IF_TERMINATED(rv); off = s2End; } @@ -273,130 +285,158 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, } DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end); - rv = 
scanDoubleUnaligned(buf, len, s3Start, key, keyLen, keyOffset, noCase, - caseMask, mask1, mask2, cbi, off, end); + rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, + mask2, cbi, off, end); return rv; #else // AVX512 - return scanDouble512(buf, len, key, keyLen, keyOffset, noCase, caseMask, - mask1, mask2, cbi, keyOffset, end); + return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); #endif // AVX512 } static really_inline -hwlm_error_t scanSingleNoCase(const u8 *buf, size_t len, const u8 *key, +hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, const struct cb_info *cbi) { - return scanSingleMain(buf, len, key, 1, cbi); + return scanSingleMain(n, buf, len, start, 1, cbi); } static really_inline -hwlm_error_t scanSingleCase(const u8 *buf, size_t len, const u8 *key, +hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, const struct cb_info *cbi) { - return scanSingleMain(buf, len, key, 0, cbi); + return scanSingleMain(n, buf, len, start, 0, cbi); } // Single-character specialisation, used when keyLen = 1 static really_inline -hwlm_error_t scanSingle(const u8 *buf, size_t len, const u8 *key, bool noCase, - const struct cb_info *cbi) { - if (!ourisalpha(key[0])) { +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + if (!ourisalpha(n->key0)) { noCase = 0; // force noCase off if we don't have an alphabetic char } // kinda ugly, but this forces constant propagation if (noCase) { - return scanSingleNoCase(buf, len, key, cbi); + return scanSingleNoCase(n, buf, len, start, cbi); } else { - return scanSingleCase(buf, len, key, cbi); + return scanSingleCase(n, buf, len, start, cbi); } } static really_inline -hwlm_error_t scanDoubleNoCase(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, +hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, const struct cb_info *cbi) { - return scanDoubleMain(buf, len, key, keyLen, keyOffset, 1, cbi); + return scanDoubleMain(n, buf, len, start, 1, cbi); } static really_inline -hwlm_error_t scanDoubleCase(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, +hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, const struct cb_info *cbi) { - return scanDoubleMain(buf, len, key, keyLen, keyOffset, 0, cbi); + return scanDoubleMain(n, buf, len, start, 0, cbi); } static really_inline -hwlm_error_t scanDouble(const u8 *buf, size_t len, const u8 *key, size_t keyLen, - size_t keyOffset, bool noCase, - const struct cb_info *cbi) { +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { // kinda ugly, but this forces constant propagation if (noCase) { - return scanDoubleNoCase(buf, len, key, keyLen, keyOffset, cbi); + return scanDoubleNoCase(n, buf, len, start, cbi); } else { - return scanDoubleCase(buf, len, key, keyLen, keyOffset, cbi); + return scanDoubleCase(n, buf, len, start, cbi); } } // main entry point for the scan code static really_inline -hwlm_error_t scan(const u8 *buf, size_t len, const u8 *key, size_t keyLen, - size_t keyOffset, bool noCase, const struct cb_info *cbi) { - if (len < keyLen) { +hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, char single, bool noCase, + const 
struct cb_info *cbi) { + if (len - start < n->msk_len) { // can't find string of length keyLen in a shorter buffer return HWLM_SUCCESS; } - if (keyLen == 1) { - assert(keyOffset == 0); - return scanSingle(buf, len, key, noCase, cbi); + if (single) { + return scanSingle(n, buf, len, start, noCase, cbi); } else { - return scanDouble(buf, len, key, keyLen, keyOffset, noCase, cbi); + return scanDouble(n, buf, len, start, noCase, cbi); } } /** \brief Block-mode scanner. */ hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, - size_t offset_adj, HWLMCallback cb, void *ctxt) { + size_t start, HWLMCallback cb, + struct hs_scratch *scratch) { assert(n && buf); - struct cb_info cbi = { cb, n->id, ctxt, offset_adj }; - DEBUG_PRINTF("nood scan of %zu bytes for %*s\n", len, n->len, n->str); - return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi); + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len, + (const char *)&n->cmp, buf); + + return scan(n, buf, len, start, n->single, n->nocase, &cbi); } /** \brief Streaming-mode scanner. */ hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, - HWLMCallback cb, void *ctxt, u8 *temp_buf, - UNUSED size_t temp_buffer_size) { + HWLMCallback cb, struct hs_scratch *scratch) { assert(n); - struct cb_info cbi = {cb, n->id, ctxt, 0}; - hwlm_error_t rv; + if (len + hlen < n->msk_len) { + DEBUG_PRINTF("not enough bytes for a match\n"); + return HWLM_SUCCESS; + } - if (hlen) { + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen, + n->msk_len, (const char *)&n->cmp, buf); + + if (hlen && n->msk_len > 1) { + /* + * we have history, so build up a buffer from enough of the history + * buffer plus what we've been given to scan. Since this is relatively + * short, just check against msk+cmp per byte offset for matches. 
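The history-handling path that follows implements the idea described in the comment above: rather than running the vector scanners across the history/block seam, it assembles at most a few bytes of history plus new data into a small temporary buffer and tests a 64-bit mask/compare pair at every offset. A standalone sketch of that mask/compare check, assuming a little-endian host and a hypothetical buildMaskCmp helper (this is not the library's actual noodTable layout or API):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Build an up-to-8-byte mask/value pair for a literal; case handling omitted.
    static void buildMaskCmp(const char *lit, size_t len, uint64_t *msk, uint64_t *cmp) {
        *msk = 0;
        *cmp = 0;
        for (size_t i = 0; i < len && i < 8; i++) {
            *msk |= 0xffULL << (8 * i);
            *cmp |= (uint64_t)(uint8_t)lit[i] << (8 * i);
        }
    }

    // Check every offset of a short buffer against the mask/compare pair.
    static void scanShort(const uint8_t *buf, size_t buflen, size_t litlen,
                          uint64_t msk, uint64_t cmp) {
        if (litlen == 0 || litlen > buflen) {
            return;
        }
        for (size_t i = 0; i + litlen <= buflen; i++) {
            uint64_t v = 0;
            memcpy(&v, buf + i, litlen < 8 ? litlen : 8); // unaligned partial load
            if ((v & msk) == cmp) {
                printf("match ending at offset %zu\n", i + litlen - 1);
            }
        }
    }

In the patch itself, msk and cmp are precomputed at compile time and stored in the noodTable, and matches are reported through the HWLM callback rather than printed.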
+ */ assert(hbuf); + u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2]; + memset(temp_buf, 0, sizeof(temp_buf)); - size_t tl1 = MIN(n->len - 1, hlen); - size_t tl2 = MIN(n->len - 1, len); - size_t temp_len = tl1 + tl2; - assert(temp_len < temp_buffer_size); - memcpy(temp_buf, hbuf + hlen - tl1, tl1); - memcpy(temp_buf + tl1, buf, tl2); + assert(n->msk_len); + size_t tl1 = MIN((size_t)n->msk_len - 1, hlen); + size_t tl2 = MIN((size_t)n->msk_len - 1, len); - cbi.offsetAdj = -tl1; - rv = scan(temp_buf, temp_len, n->str, n->len, n->key_offset, n->nocase, - &cbi); - if (rv == HWLM_TERMINATED) { - return HWLM_TERMINATED; + assert(tl1 + tl2 <= sizeof(temp_buf)); + assert(tl1 + tl2 >= n->msk_len); + assert(tl1 <= sizeof(u64a)); + assert(tl2 <= sizeof(u64a)); + DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2); + + unaligned_store_u64a(temp_buf, + partial_load_u64a(hbuf + hlen - tl1, tl1)); + unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2)); + + for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) { + u64a v = unaligned_load_u64a(temp_buf + i); + if ((v & n->msk) == n->cmp) { + size_t m_end = -tl1 + i + n->msk_len - 1; + DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i); + hwlmcb_rv_t rv = cb(m_end, n->id, scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + } } } assert(buf); cbi.offsetAdj = 0; - return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi); + return scan(n, buf, len, 0, n->single, n->nocase, &cbi); } diff --git a/src/hwlm/noodle_engine.h b/src/hwlm/noodle_engine.h index e044a863..64422c41 100644 --- a/src/hwlm/noodle_engine.h +++ b/src/hwlm/noodle_engine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,16 +41,17 @@ extern "C" #endif struct noodTable; +struct hs_scratch; /** \brief Block-mode scanner. */ hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, - size_t offset_adj, HWLMCallback cb, void *ctxt); + size_t start, HWLMCallback cb, + struct hs_scratch *scratch); /** \brief Streaming-mode scanner. 
*/ hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, - HWLMCallback cb, void *ctxt, u8 *temp_buf, - size_t temp_buffer_size); + HWLMCallback cb, struct hs_scratch *scratch); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index a3f46047..f10e4a7b 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -38,10 +38,11 @@ static really_inline m256 getCaseMask(void) { } static really_inline -hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset, - const u8 *key, bool noCase, m256 caseMask, - m256 mask1, const struct cb_info *cbi, - size_t start, size_t end) { +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; @@ -66,11 +67,11 @@ hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset, } static really_inline -hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset, - const u8 *key, size_t keyLen, size_t keyOffset, - bool noCase, m256 caseMask, m256 mask1, - m256 mask2, const struct cb_info *cbi, - size_t start, size_t end) { +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, m256 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; @@ -100,8 +101,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset, // alignment boundary if needed and to finish off data that the aligned scan // function can't handle (due to small/unaligned chunk at end) static really_inline -hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, - bool noCase, m256 caseMask, m256 mask1, +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -140,11 +141,10 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m256 caseMask, m256 mask1, m256 mask2, - const struct cb_info *cbi, size_t start, - size_t end) { +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, + size_t start, size_t end) { const u8 *d = buf + start; size_t l = end - start; if (!l) { @@ -182,8 +182,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key, - bool noCase, m256 caseMask, m256 mask1, +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; @@ -203,10 +203,9 @@ hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, 
const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m256 caseMask, m256 mask1, m256 mask2, - const struct cb_info *cbi, size_t start, +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; DEBUG_PRINTF("start %zu end %zu \n", start, end); diff --git a/src/hwlm/noodle_engine_avx512.c b/src/hwlm/noodle_engine_avx512.c index d4e6527f..8cac1b15 100644 --- a/src/hwlm/noodle_engine_avx512.c +++ b/src/hwlm/noodle_engine_avx512.c @@ -43,8 +43,8 @@ m512 getCaseMask(void) { // alignment boundary if needed and to finish off data that the aligned scan // function can't handle (due to small/unaligned chunk at end) static really_inline -hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, - bool noCase, m512 caseMask, m512 mask1, +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -73,11 +73,12 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key, +hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len, bool noCase, m512 caseMask, m512 mask1, - const struct cb_info *cbi) { - const u8 *d = buf; - const u8 *e = buf + len; + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; DEBUG_PRINTF("start %p end %p \n", d, e); assert(d < e); if (d + 64 >= e) { @@ -86,8 +87,8 @@ hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key, // peel off first part to cacheline boundary const u8 *d1 = ROUNDUP_PTR(d, 64); - if (scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, - d1 - d) == HWLM_TERMINATED) { + if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start, + d1 - buf) == HWLM_TERMINATED) { return HWLM_TERMINATED; } d = d1; @@ -106,16 +107,15 @@ tail: DEBUG_PRINTF("d %p e %p \n", d, e); // finish off tail - return scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, d - buf, + return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf, e - buf); } static really_inline -hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m512 caseMask, m512 mask1, m512 mask2, - const struct cb_info *cbi, u64a *lastz0, - size_t start, size_t end) { +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, + m512 mask2, const struct cb_info *cbi, + u64a *lastz0, size_t start, size_t end) { DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); const u8 *d = buf + start; ptrdiff_t scan_len = end - start; @@ -142,9 +142,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m512 caseMask, m512 mask1, m512 mask2, +hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len, + bool noCase, m512 caseMask, m512 mask1, m512 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -158,9 +157,8 @@ hwlm_error_t scanDouble512(const u8 *buf, size_t len, const 
u8 *key, // peel off first part to cacheline boundary const u8 *d1 = ROUNDUP_PTR(d, 64); - if (scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, - mask1, mask2, cbi, &lastz0, start, - d1 - buf) == HWLM_TERMINATED) { + if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, start, d1 - buf) == HWLM_TERMINATED) { return HWLM_TERMINATED; } d = d1; @@ -188,6 +186,6 @@ tail: DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); // finish off tail - return scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, - mask1, mask2, cbi, &lastz0, d - buf, end); + return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, d - buf, end); } diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 40575409..7cd53d7c 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,8 +38,8 @@ static really_inline m128 getCaseMask(void) { } static really_inline -hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, - bool noCase, m128 caseMask, m128 mask1, +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -67,10 +67,11 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset, - const u8 *key, bool noCase, m128 caseMask, - m128 mask1, const struct cb_info *cbi, - size_t start, size_t end) { +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; @@ -96,11 +97,10 @@ hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset, } static really_inline -hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m128 caseMask, m128 mask1, m128 mask2, - const struct cb_info *cbi, size_t start, - size_t end) { +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, + size_t start, size_t end) { const u8 *d = buf + start; size_t l = end - start; if (!l) { @@ -128,11 +128,11 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset, - const u8 *key, size_t keyLen, size_t keyOffset, - bool noCase, m128 caseMask, m128 mask1, - m128 mask2, const struct cb_info *cbi, - size_t start, size_t end) { +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, m128 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; @@ -158,8 +158,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, 
size_t len, size_t offset, } static really_inline -hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key, - bool noCase, m128 caseMask, m128 mask1, +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; @@ -179,10 +179,9 @@ hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key, } static really_inline -hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key, - size_t keyLen, size_t keyOffset, bool noCase, - m128 caseMask, m128 mask1, m128 mask2, - const struct cb_info *cbi, size_t start, +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); diff --git a/src/hwlm/noodle_internal.h b/src/hwlm/noodle_internal.h index cc287816..8f76f177 100644 --- a/src/hwlm/noodle_internal.h +++ b/src/hwlm/noodle_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,18 +30,22 @@ * \brief Data structures for Noodle literal matcher engine. */ -#ifndef NOODLE_INTERNAL_H_25D751C42E34A6 -#define NOODLE_INTERNAL_H_25D751C42E34A6 +#ifndef NOODLE_INTERNAL_H +#define NOODLE_INTERNAL_H #include "ue2common.h" struct noodTable { u32 id; - u32 len; - u32 key_offset; - u8 nocase; - u8 str[]; + u64a msk; + u64a cmp; + u8 msk_len; + u8 key_offset; + u8 nocase; + u8 single; + u8 key0; + u8 key1; }; -#endif /* NOODLE_INTERNAL_H_25D751C42E34A6 */ +#endif /* NOODLE_INTERNAL_H */ diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index 7c56ba72..4508d4f1 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -41,6 +41,8 @@ #include "util/verify_types.h" #include +#include +#include #include #define PATHS_LIMIT 500 @@ -65,6 +67,17 @@ void dump_paths(const Container &paths) { DEBUG_PRINTF("%zu paths\n", paths.size()); } +static +vector reverse_alpha_remapping(const raw_dfa &rdfa) { + vector rv(rdfa.alpha_size - 1); /* TOP not required */ + + for (u32 i = 0; i < N_CHARS; i++) { + rv.at(rdfa.alpha_remap[i]).set(i); + } + + return rv; +} + static bool is_useful_path(const vector &good, const path &p) { for (const auto &g : good) { @@ -98,9 +111,10 @@ path append(const path &orig, const CharReach &cr, u32 new_dest) { } static -void extend(const raw_dfa &rdfa, const path &p, - map> &all, vector &out) { - dstate s = rdfa.states[p.dest]; +void extend(const raw_dfa &rdfa, const vector &rev_map, + const path &p, unordered_map> &all, + vector &out) { + const dstate &s = rdfa.states[p.dest]; if (!p.reach.empty() && p.reach.back().none()) { out.push_back(p); @@ -125,9 +139,9 @@ void extend(const raw_dfa &rdfa, const path &p, } flat_map dest; - for (unsigned i = 0; i < N_CHARS; i++) { - u32 succ = s.next[rdfa.alpha_remap[i]]; - dest[succ].set(i); + for (u32 i = 0; i < rev_map.size(); i++) { + u32 succ = s.next[i]; + dest[succ] |= rev_map[i]; } for (const auto &e : dest) { @@ -148,13 +162,14 @@ void extend(const raw_dfa &rdfa, const path &p, static vector> generate_paths(const raw_dfa &rdfa, dstate_id_t base, u32 len) { + const vector rev_map = 
reverse_alpha_remapping(rdfa); vector paths{path(base)}; - map> all; + unordered_map> all; all[base].push_back(path(base)); for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) { vector next_gen; for (const auto &p : paths) { - extend(rdfa, p, all, next_gen); + extend(rdfa, rev_map, p, all, next_gen); } paths = move(next_gen); @@ -195,17 +210,6 @@ bool better(const AccelScheme &a, const AccelScheme &b) { return a.cr.count() < b.cr.count(); } -static -vector reverse_alpha_remapping(const raw_dfa &rdfa) { - vector rv(rdfa.alpha_size - 1); /* TOP not required */ - - for (u32 i = 0; i < N_CHARS; i++) { - rv.at(rdfa.alpha_remap[i]).set(i); - } - - return rv; -} - static bool double_byte_ok(const AccelScheme &info) { return !info.double_byte.empty() && @@ -225,16 +229,16 @@ bool has_self_loop(dstate_id_t s, const raw_dfa &raw) { } static -vector find_nonexit_symbols(const raw_dfa &rdfa, - const CharReach &escape) { - set rv; +flat_set find_nonexit_symbols(const raw_dfa &rdfa, + const CharReach &escape) { + flat_set rv; CharReach nonexit = ~escape; - for (auto i = nonexit.find_first(); i != CharReach::npos; + for (auto i = nonexit.find_first(); i != nonexit.npos; i = nonexit.find_next(i)) { rv.insert(rdfa.alpha_remap[i]); } - return vector(rv.begin(), rv.end()); + return rv; } static @@ -254,7 +258,7 @@ dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { u16 top_remap = raw.alpha_remap[TOP]; - ue2::unordered_set seen; + std::unordered_set seen; while (true) { seen.insert(s); DEBUG_PRINTF("basis %hu\n", s); @@ -288,7 +292,7 @@ dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { static set find_region(const raw_dfa &rdfa, dstate_id_t base, - const AccelScheme &ei) { + const AccelScheme &ei) { DEBUG_PRINTF("looking for region around %hu\n", base); set region = {base}; diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index 0d19fa8c..4c33b351 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -44,6 +44,8 @@ #include "util/simd_types.h" #include +#include +#include #include #ifndef DUMP_SUPPORT diff --git a/src/nfa/accelcompile.h b/src/nfa/accelcompile.h index 9bd4ff18..d0b3cdc7 100644 --- a/src/nfa/accelcompile.h +++ b/src/nfa/accelcompile.h @@ -31,7 +31,7 @@ #include "ue2common.h" #include "util/charreach.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" union AccelAux; diff --git a/src/nfa/castle_dump.cpp b/src/nfa/castle_dump.cpp index 1514ca8c..595b98ec 100644 --- a/src/nfa/castle_dump.cpp +++ b/src/nfa/castle_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,7 +71,7 @@ void dumpTextSubCastle(const SubCastle &sub, FILE *f) { void nfaExecCastle_dump(const struct NFA *nfa, const string &base) { const Castle *c = (const Castle *)getImplNfa(nfa); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); fprintf(f, "Castle multi-tenant repeat engine\n"); fprintf(f, "\n"); @@ -117,7 +117,6 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) { fprintf(f, "Sub %u:\n", i); dumpTextSubCastle(sub[i], f); } - fclose(f); } } // namespace ue2 diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 40fbc18c..5884ebb2 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -48,11 +48,11 @@ #include "util/compile_context.h" #include 
"util/container.h" #include "util/dump_charclass.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/make_unique.h" #include "util/multibit_build.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include "util/verify_types.h" #include "grey.h" @@ -153,13 +153,11 @@ static void getNeighborInfo(const CliqueGraph &g, vector &neighbor, const CliqueVertex &cv, const set &group) { u32 id = g[cv].stateId; - ue2::unordered_set neighborId; // find neighbors for cv for (const auto &v : adjacent_vertices_range(cv, g)) { - if (g[v].stateId != id && contains(group, g[v].stateId)){ + if (g[v].stateId != id && contains(group, g[v].stateId)) { neighbor.push_back(g[v].stateId); - neighborId.insert(g[v].stateId); DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); } } @@ -772,7 +770,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2, const u32 top = m.first; const PureRepeat &pr = m.second; DEBUG_PRINTF("top %u\n", top); - u32 new_top = c1.add(pr); + u32 new_top = c1.merge(pr); top_map[top] = new_top; DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top); } @@ -883,7 +881,7 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) { } bool requiresDedupe(const CastleProto &proto, - const ue2::flat_set &reports) { + const flat_set &reports) { for (const auto &report : reports) { auto it = proto.report_map.find(report); if (it == end(proto.report_map)) { diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index 9f44692d..ea5f06da 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -39,11 +39,12 @@ #include "nfagraph/ng_repeat.h" #include "util/bytecode_ptr.h" #include "util/depth.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include #include #include +#include #include struct NFA; @@ -89,7 +90,7 @@ struct CastleProto { std::map repeats; /** \brief Mapping from report to associated tops. */ - ue2::unordered_map> report_map; + std::unordered_map> report_map; /** * \brief Next top id to use. Repeats may be removed without top remapping, @@ -127,7 +128,9 @@ buildCastle(const CastleProto &proto, const CompileContext &cc, const ReportManager &rm); /** - * \brief Merge two CastleProto prototypes together, if possible. + * \brief Merge two CastleProto prototypes together, if possible. If a + * particular repeat from c2 is already in c1, then it will be reused rather + * than adding a duplicate repeat. * * Returns true if merge of all repeats in c2 into c1 succeeds, and fills * mapping with the repeat indices. @@ -155,7 +158,7 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2); * of the reports in the given set. */ bool requiresDedupe(const CastleProto &proto, - const ue2::flat_set &reports); + const flat_set &reports); /** * \brief Build an NGHolder from a CastleProto. 
diff --git a/src/nfa/dfa_build_strat.cpp b/src/nfa/dfa_build_strat.cpp index d4d418aa..b6b7a7fb 100644 --- a/src/nfa/dfa_build_strat.cpp +++ b/src/nfa/dfa_build_strat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,11 +30,9 @@ namespace ue2 { -// prevent weak vtables for raw_report_info, dfa_build_strat and raw_dfa +// prevent weak vtables for raw_report_info, dfa_build_strat raw_report_info::~raw_report_info() {} dfa_build_strat::~dfa_build_strat() {} -raw_dfa::~raw_dfa() {} - } // namespace ue2 diff --git a/src/nfa/dfa_min.cpp b/src/nfa/dfa_min.cpp index f309cc53..1a07e8a7 100644 --- a/src/nfa/dfa_min.cpp +++ b/src/nfa/dfa_min.cpp @@ -59,12 +59,13 @@ #include "dfa_min.h" #include "grey.h" +#include "mcclellancompile_util.h" #include "rdfa.h" #include "ue2common.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/noncopyable.h" #include "util/partitioned_set.h" -#include "util/ue2_containers.h" #include #include @@ -299,6 +300,10 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) { return; } + if (is_dead(rdfa)) { + DEBUG_PRINTF("dfa is empty\n"); + } + UNUSED const size_t states_before = rdfa.states.size(); HopcroftInfo info(rdfa); diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index 58b05d3d..ba7f2718 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -37,11 +37,11 @@ #include "nfa_internal.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include "util/verify_types.h" #include "ue2common.h" diff --git a/src/nfa/goughcompile.h b/src/nfa/goughcompile.h index 72469f3c..00da1891 100644 --- a/src/nfa/goughcompile.h +++ b/src/nfa/goughcompile.h @@ -33,7 +33,7 @@ #include "nfa_kind.h" #include "ue2common.h" #include "util/bytecode_ptr.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/order_check.h" #include diff --git a/src/nfa/goughcompile_dump.cpp b/src/nfa/goughcompile_dump.cpp index cb361cdb..96ab196e 100644 --- a/src/nfa/goughcompile_dump.cpp +++ b/src/nfa/goughcompile_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,8 +32,10 @@ #include "goughcompile_internal.h" #include "grey.h" #include "util/container.h" +#include "util/dump_util.h" #include "util/graph_range.h" +#include #include #ifndef DUMP_SUPPORT @@ -66,10 +68,7 @@ string dump_name(const gough_edge_id &e) { static void dump_graph(const GoughGraph &g, const string &base, const Grey &grey) { - stringstream ss; - ss << grey.dumpPath << "gough_" << base << ".dot"; - - FILE *f = fopen(ss.str().c_str(), "w"); + StdioFile f(grey.dumpPath + "gough_" + base + ".dot", "w"); fprintf(f, "digraph NFA {\n"); fprintf(f, "rankdir=LR;\n"); @@ -94,8 +93,6 @@ void dump_graph(const GoughGraph &g, const string &base, const Grey &grey) { dump_name(g[s]).c_str(), dump_name(g[t]).c_str()); } fprintf(f, "}\n"); - - fclose(f); } static @@ -133,9 +130,7 @@ set uses(const GoughEdgeProps &ep) { static 
void dump_var_mapping(const GoughGraph &g, const string &base, const Grey &grey) { - stringstream ss; - ss << grey.dumpPath << "gough_" << base << "_vars.txt"; - FILE *f = fopen(ss.str().c_str(), "w"); + StdioFile f(grey.dumpPath + "gough_" + base + "_vars.txt", "w"); for (auto v : vertices_range(g)) { set used = uses(g[v]); if (g[v].vars.empty() && used.empty()) { @@ -180,7 +175,6 @@ void dump_var_mapping(const GoughGraph &g, const string &base, fprintf(f, "\n"); } } - fclose(f); } static @@ -220,12 +214,7 @@ void gather_vars(const GoughGraph &g, vector *vars, static void dump_vars(const GoughGraph &g, const string &base, const Grey &grey) { - FILE *f; - { - stringstream ss; - ss << grey.dumpPath << "gough_" << base << "_vars.dot"; - f = fopen(ss.str().c_str(), "w"); - } + StdioFile f(grey.dumpPath + "gough_" + base + "_vars.dot", "w"); fprintf(f, "digraph NFA {\n"); fprintf(f, "rankdir=LR;\n"); fprintf(f, "size=\"11.5,8\"\n"); @@ -271,7 +260,6 @@ void dump_vars(const GoughGraph &g, const string &base, const Grey &grey) { } fprintf(f, "}\n"); - fclose(f); } void dump(const GoughGraph &g, const string &base, const Grey &grey) { @@ -317,18 +305,11 @@ void dump_blocks(const map> &blocks, return; } - FILE *f; - { - stringstream ss; - ss << grey.dumpPath << "gough_" << base << "_programs.txt"; - f = fopen(ss.str().c_str(), "w"); - } + StdioFile f(grey.dumpPath + "gough_" + base + "_programs.txt", "w"); for (const auto &m : blocks) { dump_block(f, m.first, m.second); } - - fclose(f); } } // namespace ue2 diff --git a/src/nfa/goughcompile_internal.h b/src/nfa/goughcompile_internal.h index a6ba0d1b..e6454052 100644 --- a/src/nfa/goughcompile_internal.h +++ b/src/nfa/goughcompile_internal.h @@ -33,9 +33,9 @@ #include "mcclellancompile.h" #include "ue2common.h" #include "util/charreach.h" +#include "util/flat_containers.h" #include "util/noncopyable.h" #include "util/order_check.h" -#include "util/ue2_containers.h" #include #include @@ -106,10 +106,10 @@ struct GoughSSAVarJoin; struct GoughSSAVar : noncopyable { GoughSSAVar(void) : seen(false), slot(INVALID_SLOT) {} virtual ~GoughSSAVar(); - const ue2::flat_set &get_inputs() const { + const flat_set &get_inputs() const { return inputs; } - const ue2::flat_set &get_outputs() const { + const flat_set &get_outputs() const { return outputs; } virtual void replace_input(GoughSSAVar *old_v, GoughSSAVar *new_v) = 0; @@ -127,8 +127,8 @@ struct GoughSSAVar : noncopyable { clear_outputs(); } protected: - ue2::flat_set inputs; - ue2::flat_set outputs; + flat_set inputs; + flat_set outputs; friend struct GoughSSAVarWithInputs; friend struct GoughSSAVarMin; friend struct GoughSSAVarJoin; @@ -184,16 +184,14 @@ struct GoughSSAVarJoin : public GoughSSAVarWithInputs { void add_input(GoughSSAVar *v, GoughEdge prev); - const ue2::flat_set &get_edges_for_input(GoughSSAVar *input) - const; - const std::map > &get_input_map() - const; + const flat_set &get_edges_for_input(GoughSSAVar *input) const; + const std::map> &get_input_map() const; protected: void remove_input_raw(GoughSSAVar *v) override; private: - std::map> input_map; + std::map> input_map; }; struct gough_accel_state_info { diff --git a/src/nfa/goughcompile_reg.cpp b/src/nfa/goughcompile_reg.cpp index a9370450..48e515b9 100644 --- a/src/nfa/goughcompile_reg.cpp +++ b/src/nfa/goughcompile_reg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted 
provided that the following conditions are met: @@ -32,10 +32,10 @@ #include "gough_internal.h" #include "grey.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" #include "util/order_check.h" -#include "util/ue2_containers.h" #include "ue2common.h" @@ -235,7 +235,7 @@ void handle_pending_vertices(GoughSSAVar *def, const GoughGraph &g, if (contains(aux.containing_v, def)) { def_v = aux.containing_v.at(def); } - ue2::unordered_set done; + unordered_set done; while (!pending_vertex.empty()) { GoughVertex current = *pending_vertex.begin(); pending_vertex.erase(current); diff --git a/src/nfa/goughdump.cpp b/src/nfa/goughdump.cpp index 1b37a0b1..5f710612 100644 --- a/src/nfa/goughdump.cpp +++ b/src/nfa/goughdump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -353,22 +353,14 @@ void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) { void nfaExecGough16_dump(const NFA *nfa, const string &base) { assert(nfa->type == GOUGH_NFA_16); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - nfaExecGough16_dumpText(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - nfaExecGough16_dumpDot(nfa, f); - fclose(f); + nfaExecGough16_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecGough16_dumpDot(nfa, StdioFile(base + ".dot", "w")); } void nfaExecGough8_dump(const NFA *nfa, const string &base) { assert(nfa->type == GOUGH_NFA_8); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - nfaExecGough8_dumpText(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - nfaExecGough8_dumpDot(nfa, f); - fclose(f); + nfaExecGough8_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecGough8_dumpDot(nfa, StdioFile(base + ".dot", "w")); } } // namespace ue2 diff --git a/src/nfa/lbr_dump.cpp b/src/nfa/lbr_dump.cpp index 0948e122..89da6871 100644 --- a/src/nfa/lbr_dump.cpp +++ b/src/nfa/lbr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,47 +71,40 @@ void nfaExecLbrDot_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_DOT); const lbr_dot *ld = (const lbr_dot *)getImplNfa(nfa); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); lbrDumpCommon(&ld->common, f); fprintf(f, "DOT model\n"); fprintf(f, "\n"); dumpTextReverse(nfa, f); - fclose(f); } void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_VERM); const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa); - - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - + StdioFile f(base + ".txt", "w"); lbrDumpCommon(&lv->common, f); fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c); fprintf(f, "\n"); dumpTextReverse(nfa, f); - fclose(f); } void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_NVERM); const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa); - - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - + StdioFile f(base + ".txt", "w"); lbrDumpCommon(&lv->common, f); fprintf(f, "NEGATED VERM model, scanning 
for 0x%02x\n", lv->c); fprintf(f, "\n"); dumpTextReverse(nfa, f); - fclose(f); } void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_SHUF); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa); lbrDumpCommon(&ls->common, f); @@ -122,14 +115,13 @@ void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) { describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); - fclose(f); } void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_TRUF); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa); lbrDumpCommon(<->common, f); @@ -140,7 +132,6 @@ void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) { describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); - fclose(f); } } // namespace ue2 diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 7183d4b7..6053b56f 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -53,11 +53,13 @@ #include "util/charreach.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/order_check.h" +#include "util/unordered.h" #include "util/verify_types.h" -#include "util/ue2_containers.h" #include #include @@ -96,18 +98,20 @@ struct precalcAccel { }; struct limex_accel_info { - ue2::unordered_set accelerable; + unordered_set accelerable; map precalc; - ue2::unordered_map> friends; - ue2::unordered_map accel_map; + unordered_map> friends; + unordered_map accel_map; }; static -map -reindexByStateId(const map &in, const NGHolder &g, - const ue2::unordered_map &state_ids, +unordered_map +reindexByStateId(const unordered_map &in, + const NGHolder &g, + const unordered_map &state_ids, const u32 num_states) { - map out; + unordered_map out; + out.reserve(in.size()); vector indexToState(num_vertices(g), NO_STATE); for (const auto &m : state_ids) { @@ -137,18 +141,20 @@ reindexByStateId(const map &in, const NGHolder &g, struct build_info { build_info(NGHolder &hi, - const ue2::unordered_map &states_in, + const unordered_map &states_in, const vector &ri, - const map &rsmi, - const map &smi, + const unordered_map &rsmi, + const unordered_map &smi, const map> &ti, const set &zi, - bool dai, bool sci, const CompileContext &cci, - u32 nsi) - : h(hi), state_ids(states_in), repeats(ri), tops(ti), zombies(zi), - do_accel(dai), stateCompression(sci), cc(cci), + bool dai, bool sci, const CompileContext &cci, u32 nsi) + : h(hi), state_ids(states_in), repeats(ri), tops(ti), tugs(nsi), + zombies(zi), do_accel(dai), stateCompression(sci), cc(cci), num_states(nsi) { for (const auto &br : repeats) { - insert(&tugs, br.tug_triggers); + for (auto v : br.tug_triggers) { + assert(state_ids.at(v) != NO_STATE); + tugs.set(state_ids.at(v)); + } br_cyclic[br.cyclic] = BoundedRepeatSummary(br.repeatMin, br.repeatMax); } @@ -160,15 +166,15 @@ struct build_info { } NGHolder &h; - const ue2::unordered_map &state_ids; + const unordered_map &state_ids; const vector &repeats; // Squash maps; state sets are indexed by state_id. 
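The squash maps declared just after this comment now key on the state id (u32) rather than the graph vertex, with each state mapped to an NFAStateSet bitmask. The general squash idea is that when a state matches, the live-state set is ANDed with that state's mask to switch off states the match has made redundant. A minimal sketch with std::bitset and a hypothetical state limit (the engine's real NFAStateSet type and limits differ):

    #include <bitset>
    #include <cstddef>
    #include <unordered_map>

    constexpr std::size_t kMaxStates = 512;       // hypothetical limit
    using StateSet = std::bitset<kMaxStates>;     // one bit per NFA state

    // Squash masks carry a 0 bit for every state to be switched off.
    StateSet applySquash(const StateSet &live, std::size_t reportingState,
                         const std::unordered_map<std::size_t, StateSet> &squashMap) {
        auto it = squashMap.find(reportingState);
        if (it == squashMap.end()) {
            return live;           // nothing to squash for this state
        }
        return live & it->second;  // clear the squashed states
    }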
- map reportSquashMap; - map squashMap; + unordered_map reportSquashMap; + unordered_map squashMap; const map> &tops; - ue2::unordered_set tugs; + NFAStateSet tugs; map br_cyclic; const set &zombies; bool do_accel; @@ -238,7 +244,7 @@ bool isLimitedTransition(int from, int to, int maxshift) { // Fill a bit mask template -void maskFill(Mask &m, char c) { +void maskFill(Mask &m, u8 c) { memset(&m, c, sizeof(m)); } @@ -478,7 +484,7 @@ bool allow_wide_accel(const vector &vv, const NGHolder &g, static void nfaFindAccelSchemes(const NGHolder &g, const map &br_cyclic, - ue2::unordered_map *out) { + unordered_map *out) { vector refined_cr = reduced_cr(g, br_cyclic); NFAVertex sds_or_proxy = get_sds_or_proxy(g); @@ -503,8 +509,8 @@ void nfaFindAccelSchemes(const NGHolder &g, } struct fas_visitor : public boost::default_bfs_visitor { - fas_visitor(const ue2::unordered_map &am_in, - ue2::unordered_map *out_in) + fas_visitor(const unordered_map &am_in, + unordered_map *out_in) : accel_map(am_in), out(out_in) {} void discover_vertex(NFAVertex v, const NGHolder &) { @@ -515,13 +521,13 @@ struct fas_visitor : public boost::default_bfs_visitor { throw this; /* done */ } } - const ue2::unordered_map &accel_map; - ue2::unordered_map *out; + const unordered_map &accel_map; + unordered_map *out; }; static void filterAccelStates(NGHolder &g, const map> &tops, - ue2::unordered_map *accel_map) { + unordered_map *accel_map) { /* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything * else should be ditched. We use a simple BFS to choose accel states near * the start. */ @@ -541,14 +547,12 @@ void filterAccelStates(NGHolder &g, const map> &tops, tempEdges.push_back(e); // Remove edge later. } - ue2::unordered_map out; + unordered_map out; try { - vector colour(num_vertices(g)); boost::breadth_first_search(g, g.start, - visitor(fas_visitor(*accel_map, &out)) - .color_map(make_iterator_property_map(colour.begin(), - get(vertex_index, g)))); + visitor(fas_visitor(*accel_map, &out)) + .color_map(make_small_color_map(g))); } catch (fas_visitor *) { ; /* found max accel_states */ } @@ -983,16 +987,18 @@ u32 addSquashMask(const build_info &args, const NFAVertex &v, return idx; } +using ReportListCache = ue2_unordered_map, u32>; + static u32 addReports(const flat_set &r, vector &reports, - unordered_map, u32> &reportListCache) { + ReportListCache &reports_cache) { assert(!r.empty()); vector my_reports(begin(r), end(r)); my_reports.push_back(MO_INVALID_IDX); // sentinel - auto cache_it = reportListCache.find(my_reports); - if (cache_it != end(reportListCache)) { + auto cache_it = reports_cache.find(my_reports); + if (cache_it != end(reports_cache)) { u32 offset = cache_it->second; DEBUG_PRINTF("reusing cached report list at %u\n", offset); return offset; @@ -1008,13 +1014,12 @@ u32 addReports(const flat_set &r, vector &reports, u32 offset = verify_u32(reports.size()); insert(&reports, reports.end(), my_reports); - reportListCache.emplace(move(my_reports), offset); + reports_cache.emplace(move(my_reports), offset); return offset; } static -void buildAcceptsList(const build_info &args, - unordered_map, u32> &reports_cache, +void buildAcceptsList(const build_info &args, ReportListCache &reports_cache, vector &verts, vector &accepts, vector &reports, vector &squash) { if (verts.empty()) { @@ -1052,8 +1057,7 @@ void buildAcceptsList(const build_info &args, } static -void buildAccepts(const build_info &args, - unordered_map, u32> &reports_cache, +void buildAccepts(const build_info &args, ReportListCache 
&reports_cache, NFAStateSet &acceptMask, NFAStateSet &acceptEodMask, vector &accepts, vector &acceptsEod, vector &reports, vector &squash) { @@ -1120,7 +1124,7 @@ u32 uncompressedStateSize(u32 num_states) { static u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates, - const ue2::unordered_map &state_ids) { + const unordered_map &state_ids) { // Shrink state requirement to enough to fit the compressed largest reach. vector allreach(N_CHARS, 0); @@ -1191,7 +1195,7 @@ bool hasSquashableInitDs(const build_info &args) { static bool hasInitDsStates(const NGHolder &h, - const ue2::unordered_map &state_ids) { + const unordered_map &state_ids) { if (state_ids.at(h.startDs) != NO_STATE) { return true; } @@ -1359,17 +1363,16 @@ struct ExceptionProto { }; static -u32 buildExceptionMap(const build_info &args, - unordered_map, u32> &reports_cache, - const ue2::unordered_set &exceptional, +u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache, + const unordered_set &exceptional, map> &exceptionMap, vector &reportList) { const NGHolder &h = args.h; const u32 num_states = args.num_states; u32 exceptionCount = 0; - ue2::unordered_map pos_trigger; - ue2::unordered_map tug_trigger; + unordered_map pos_trigger; + unordered_map tug_trigger; for (u32 i = 0; i < args.repeats.size(); i++) { const BoundedRepeatData &br = args.repeats[i]; @@ -1518,18 +1521,14 @@ u32 depth_to_u32(const depth &d) { } static -bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e, - const build_info &args, u32 maxShift) { - NFAVertex from = source(e, h); - NFAVertex to = target(e, h); - u32 f = args.state_ids.at(from); - u32 t = args.state_ids.at(to); - if (!isLimitedTransition(f, t, maxShift)) { +bool isExceptionalTransition(u32 from, u32 to, const build_info &args, + u32 maxShift) { + if (!isLimitedTransition(from, to, maxShift)) { return true; } // All transitions out of a tug trigger are exceptional. - if (contains(args.tugs, from)) { + if (args.tugs.test(from)) { return true; } return false; @@ -1545,7 +1544,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) { if (from == NO_STATE || to == NO_STATE) { continue; } - if (!isExceptionalTransition(h, e, args, MAX_SHIFT_AMOUNT)) { + if (!isExceptionalTransition(from, to, args, MAX_SHIFT_AMOUNT)) { shiftMask |= (1UL << (to - from)); } } @@ -1574,7 +1573,7 @@ int getLimexScore(const build_info &args, u32 nShifts) { if (from == NO_STATE || to == NO_STATE) { continue; } - if (isExceptionalTransition(h, e, args, maxVarShift)) { + if (isExceptionalTransition(from, to, args, maxVarShift)) { exceptionalStates.set(from); } } @@ -1615,9 +1614,7 @@ bool cannotDie(const build_info &args, const set &tops) { // top, looking for a cyclic path consisting of vertices of dot reach. If // one exists, than the NFA cannot die after this top is triggered. 
- vector colours(num_vertices(h)); - auto colour_map = boost::make_iterator_property_map(colours.begin(), - get(vertex_index, h)); + auto colour_map = make_small_color_map(h); struct CycleFound {}; struct CannotDieVisitor : public boost::default_dfs_visitor { @@ -1848,10 +1845,9 @@ struct Factory { maskSetBit(limex->repeatCyclicMask, cyclic); } /* also include tugs in repeat cyclic mask */ - for (NFAVertex v : args.tugs) { - u32 v_state = args.state_ids.at(v); - assert(v_state != NO_STATE); - maskSetBit(limex->repeatCyclicMask, v_state); + for (size_t i = args.tugs.find_first(); i != args.tugs.npos; + i = args.tugs.find_next(i)) { + maskSetBit(limex->repeatCyclicMask, i); } } @@ -1872,7 +1868,7 @@ struct Factory { // We check for exceptional transitions here, as we don't want tug // trigger transitions emitted as limited transitions (even if they // could be in this model). - if (!isExceptionalTransition(h, e, args, maxShift)) { + if (!isExceptionalTransition(from, to, args, maxShift)) { u32 shift = to - from; if ((shiftMask & (1UL << shift)) == 0UL) { shiftMask |= (1UL << shift); @@ -1896,7 +1892,7 @@ struct Factory { static void findExceptionalTransitions(const build_info &args, - ue2::unordered_set &exceptional, + unordered_set &exceptional, u32 maxShift) { const NGHolder &h = args.h; @@ -1907,7 +1903,7 @@ struct Factory { continue; } - if (isExceptionalTransition(h, e, args, maxShift)) { + if (isExceptionalTransition(from, to, args, maxShift)) { exceptional.insert(e); } } @@ -2171,9 +2167,9 @@ struct Factory { // We track report lists that have already been written into the global // list in case we can reuse them. - unordered_map, u32> reports_cache; + ReportListCache reports_cache; - ue2::unordered_set exceptional; + unordered_set exceptional; u32 shiftCount = findBestNumOfVarShifts(args); assert(shiftCount); u32 maxShift = findMaxVarShift(args, shiftCount); @@ -2377,10 +2373,10 @@ MAKE_LIMEX_TRAITS(512) // Some sanity tests, called by an assertion in generate(). 
static UNUSED bool isSane(const NGHolder &h, const map> &tops, - const ue2::unordered_map &state_ids, + const unordered_map &state_ids, u32 num_states) { - ue2::unordered_set seen; - ue2::unordered_set top_starts; + unordered_set seen; + unordered_set top_starts; for (const auto &vv : tops | map_values) { insert(&top_starts, vv); } @@ -2427,7 +2423,7 @@ bool isSane(const NGHolder &h, const map> &tops, #endif // NDEBUG static -u32 max_state(const ue2::unordered_map &state_ids) { +u32 max_state(const unordered_map &state_ids) { u32 rv = 0; for (const auto &m : state_ids) { DEBUG_PRINTF("state %u\n", m.second); @@ -2440,14 +2436,14 @@ u32 max_state(const ue2::unordered_map &state_ids) { } bytecode_ptr generate(NGHolder &h, - const ue2::unordered_map &states, - const vector &repeats, - const map &reportSquashMap, - const map &squashMap, - const map> &tops, - const set &zombies, bool do_accel, - bool stateCompression, u32 hint, - const CompileContext &cc) { + const unordered_map &states, + const vector &repeats, + const unordered_map &reportSquashMap, + const unordered_map &squashMap, + const map> &tops, + const set &zombies, bool do_accel, + bool stateCompression, u32 hint, + const CompileContext &cc) { const u32 num_states = max_state(states) + 1; DEBUG_PRINTF("total states: %u\n", num_states); @@ -2510,13 +2506,13 @@ bytecode_ptr generate(NGHolder &h, } u32 countAccelStates(NGHolder &h, - const ue2::unordered_map &states, - const vector &repeats, - const map &reportSquashMap, - const map &squashMap, - const map> &tops, - const set &zombies, - const CompileContext &cc) { + const unordered_map &states, + const vector &repeats, + const unordered_map &reportSquashMap, + const unordered_map &squashMap, + const map> &tops, + const set &zombies, + const CompileContext &cc) { const u32 num_states = max_state(states) + 1; DEBUG_PRINTF("total states: %u\n", num_states); diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h index a12ae9f6..a08e0ae5 100644 --- a/src/nfa/limex_compile.h +++ b/src/nfa/limex_compile.h @@ -34,15 +34,16 @@ #ifndef LIMEX_COMPILE_H #define LIMEX_COMPILE_H -#include -#include -#include - #include "nfagraph/ng_holder.h" #include "nfagraph/ng_squash.h" // for NFAStateSet #include "ue2common.h" #include "util/bytecode_ptr.h" -#include "util/ue2_containers.h" + +#include +#include +#include +#include +#include struct NFA; @@ -69,16 +70,16 @@ struct CompileContext; * graph. */ bytecode_ptr generate(NGHolder &g, - const ue2::unordered_map &states, - const std::vector &repeats, - const std::map &reportSquashMap, - const std::map &squashMap, - const std::map> &tops, - const std::set &zombies, - bool do_accel, - bool stateCompression, - u32 hint, - const CompileContext &cc); + const std::unordered_map &states, + const std::vector &repeats, + const std::unordered_map &reportSquashMap, + const std::unordered_map &squashMap, + const std::map> &tops, + const std::set &zombies, + bool do_accel, + bool stateCompression, + u32 hint, + const CompileContext &cc); /** * \brief For a given graph, count the number of accelerable states it has. @@ -87,13 +88,13 @@ bytecode_ptr generate(NGHolder &g, * implementable. 
*/ u32 countAccelStates(NGHolder &h, - const ue2::unordered_map &states, - const std::vector &repeats, - const std::map &reportSquashMap, - const std::map &squashMap, - const std::map> &tops, - const std::set &zombies, - const CompileContext &cc); + const std::unordered_map &states, + const std::vector &repeats, + const std::unordered_map &reportSquashMap, + const std::unordered_map &squashMap, + const std::map> &tops, + const std::set &zombies, + const CompileContext &cc); } // namespace ue2 diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 797e87ba..9256c841 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -487,25 +487,24 @@ void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) { } } +template +static +void dumpLimexDot(const NFA *nfa, const limex_type *limex, FILE *f) { + dumpDotPreamble(f); + u32 state_count = nfa->nPositions; + dumpVertexDotInfo(limex, state_count, f, limex_labeller(limex)); + for (u32 i = 0; i < state_count; i++) { + dumpLimDotInfo(limex, i, f); + dumpExDotInfo(limex, i, f); + } + dumpDotTrailer(f); +} + #define LIMEX_DUMP_FN(size) \ void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) { \ auto limex = (const LimExNFA##size *)getImplNfa(nfa); \ - \ - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); \ - dumpLimexText(limex, f); \ - fclose(f); \ - \ - f = fopen_or_throw((base + ".dot").c_str(), "w"); \ - dumpDotPreamble(f); \ - u32 state_count = nfa->nPositions; \ - dumpVertexDotInfo(limex, state_count, f, \ - limex_labeller(limex)); \ - for (u32 i = 0; i < state_count; i++) { \ - dumpLimDotInfo(limex, i, f); \ - dumpExDotInfo(limex, i, f); \ - } \ - dumpDotTrailer(f); \ - fclose(f); \ + dumpLimexText(limex, StdioFile(base + ".txt", "w")); \ + dumpLimexDot(nfa, limex, StdioFile(base + ".dot", "w")); \ } LIMEX_DUMP_FN(32) diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index e875477b..d705ddf9 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -46,7 +46,7 @@ #include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/unaligned.h" #include "util/verify_types.h" @@ -288,11 +288,12 @@ unique_ptr mcclellan_build_strat::gatherReports( raw_report_list rrl(s.reports, rm, remap_reports); DEBUG_PRINTF("non empty r\n"); - if (rev.find(rrl) != rev.end()) { - reports.push_back(rev[rrl]); + auto it = rev.find(rrl); + if (it != rev.end()) { + reports.push_back(it->second); } else { DEBUG_PRINTF("adding to rl %zu\n", ri->size()); - rev[rrl] = ri->size(); + rev.emplace(rrl, ri->size()); reports.push_back(ri->size()); ri->rl.push_back(rrl); } @@ -306,13 +307,14 @@ unique_ptr mcclellan_build_strat::gatherReports( DEBUG_PRINTF("non empty r eod\n"); raw_report_list rrl(s.reports_eod, rm, remap_reports); - if (rev.find(rrl) != rev.end()) { - reports_eod.push_back(rev[rrl]); + auto it = rev.find(rrl); + if (it != rev.end()) { + reports_eod.push_back(it->second); continue; } DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); - rev[rrl] = ri->size(); + rev.emplace(rrl, ri->size()); reports_eod.push_back(ri->size()); ri->rl.push_back(rrl); } @@ -325,10 +327,9 @@ unique_ptr mcclellan_build_strat::gatherReports( *arbReport = 0; } - /* if we have only a single report id generated from all accepts (not eod) * we can take some short cuts */ - set reps; + flat_set reps; for (u32 rl_index : reports) { if (rl_index == MO_INVALID_IDX) { @@ -897,7 +898,7 @@ 
void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, } u32 self_loop_width = 0; - const dstate curr_raw = info.states[curr_id]; + const dstate &curr_raw = info.states[curr_id]; for (unsigned i = 0; i < N_CHARS; i++) { if (curr_raw.next[info.alpha_remap[i]] == curr_id) { self_loop_width++; @@ -914,33 +915,6 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, info.extra[curr_id].shermanState = true; } -/* - * Calls accessible outside this module. - */ - -u16 raw_dfa::getImplAlphaSize() const { - return alpha_size - N_SPECIAL_SYMBOL; -} - -void raw_dfa::stripExtraEodReports(void) { - /* if a state generates a given report as a normal accept - then it does - * not also need to generate an eod report for it */ - for (dstate &ds : states) { - for (const ReportID &report : ds.reports) { - ds.reports_eod.erase(report); - } - } -} - -bool raw_dfa::hasEodReports(void) const { - for (const dstate &ds : states) { - if (!ds.reports_eod.empty()) { - return true; - } - } - return false; -} - static bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) { symbol_t alphasize = raw.getImplAlphaSize(); @@ -964,7 +938,8 @@ bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, const CompileContext &cc, bool trust_daddy_states, set *accel_states) { - u16 total_daddy = 0; + assert(!is_dead(raw)); + dfa_info info(strat); bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; @@ -974,21 +949,24 @@ bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, } bool has_eod_reports = raw.hasEodReports(); - bool any_cyclic_near_anchored_state = is_cyclic_near(raw, - raw.start_anchored); - - for (u32 i = 0; i < info.size(); i++) { - find_better_daddy(info, i, using8bit, any_cyclic_near_anchored_state, - trust_daddy_states, cc.grey); - total_daddy += info.extra[i].daddytaken; - } - - DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, - info.size() * info.impl_alpha_size, info.size(), - info.impl_alpha_size); bytecode_ptr nfa; if (!using8bit) { + u16 total_daddy = 0; + bool any_cyclic_near_anchored_state + = is_cyclic_near(raw, raw.start_anchored); + + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, using8bit, + any_cyclic_near_anchored_state, + trust_daddy_states, cc.grey); + total_daddy += info.extra[i].daddytaken; + } + + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + nfa = mcclellanCompile16(info, cc, accel_states); } else { nfa = mcclellanCompile8(info, cc, accel_states); diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index baf72d9c..ce63fbbf 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -33,7 +33,6 @@ #include "rdfa.h" #include "ue2common.h" #include "util/bytecode_ptr.h" -#include "util/ue2_containers.h" #include #include diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 17e022fe..3e299b81 100644 --- a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -30,12 +30,11 @@ #include "rdfa.h" #include "util/container.h" -#include "util/ue2_containers.h" +#include "util/hash.h" #include "ue2common.h" #include - -#include +#include using namespace std; @@ -127,13 +126,11 @@ u32 remove_leading_dots(raw_dfa &raw) { static never_inline u32 calc_min_dist_from_bob(raw_dfa &raw, vector *dist_in) { vector &dist = *dist_in; - dist.clear(); - dist.resize(raw.states.size(), ~0U); + 
dist.assign(raw.states.size(), ~0U); assert(raw.start_anchored != DEAD_STATE); - deque to_visit; - to_visit.push_back(raw.start_anchored); + deque to_visit = { raw.start_anchored }; dist[raw.start_anchored] = 0; u32 last_d = 0; @@ -148,8 +145,7 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector *dist_in) { assert(d >= last_d); assert(d != ~0U); - for (u32 j = 0; j < raw.alpha_size; j++) { - dstate_id_t t = raw.states[s].next[j]; + for (dstate_id_t t : raw.states[s].next) { if (t == DEAD_STATE) { continue; } @@ -187,7 +183,21 @@ bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) { } } - return changed; + if (!changed) { + return false; + } + + // We may have cleared all reports from the DFA, in which case it should + // become empty. + if (all_of_in(raw.states, [](const dstate &ds) { + return ds.reports.empty() && ds.reports_eod.empty(); + })) { + DEBUG_PRINTF("no reports left at all, dfa is dead\n"); + raw.start_anchored = DEAD_STATE; + raw.start_floating = DEAD_STATE; + } + + return true; } set all_reports(const raw_dfa &rdfa) { @@ -218,22 +228,18 @@ bool has_non_eod_accepts(const raw_dfa &rdfa) { } size_t hash_dfa_no_reports(const raw_dfa &rdfa) { - using boost::hash_combine; - using boost::hash_range; - size_t v = 0; hash_combine(v, rdfa.alpha_size); - hash_combine(v, hash_range(begin(rdfa.alpha_remap), end(rdfa.alpha_remap))); + hash_combine(v, rdfa.alpha_remap); for (const auto &ds : rdfa.states) { - hash_combine(v, hash_range(begin(ds.next), end(ds.next))); + hash_combine(v, ds.next); } return v; } size_t hash_dfa(const raw_dfa &rdfa) { - using boost::hash_combine; size_t v = 0; hash_combine(v, hash_dfa_no_reports(rdfa)); hash_combine(v, all_reports(rdfa)); @@ -272,4 +278,9 @@ bool can_die_early(const raw_dfa &raw, u32 age_limit) { return can_die_early(raw, raw.start_anchored, visited, age_limit); } +bool is_dead(const raw_dfa &rdfa) { + return rdfa.start_anchored == DEAD_STATE && + rdfa.start_floating == DEAD_STATE; +} + } // namespace ue2 diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index d681e06b..bc730cdd 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -59,6 +59,13 @@ size_t hash_dfa(const raw_dfa &rdfa); bool can_die_early(const raw_dfa &raw, u32 age_limit); +/** + * \brief Returns true if this DFA cannot match, i.e. its start state is + * DEAD_STATE. 
+ */ +bool is_dead(const raw_dfa &rdfa); + + } // namespace ue2 #endif diff --git a/src/nfa/mcclellandump.cpp b/src/nfa/mcclellandump.cpp index 9e04ad63..a13795fd 100644 --- a/src/nfa/mcclellandump.cpp +++ b/src/nfa/mcclellandump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -442,22 +442,14 @@ void nfaExecMcClellan8_dumpText(const NFA *nfa, FILE *f) { void nfaExecMcClellan16_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCCLELLAN_NFA_16); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - nfaExecMcClellan16_dumpText(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - nfaExecMcClellan16_dumpDot(nfa, f); - fclose(f); + nfaExecMcClellan16_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecMcClellan16_dumpDot(nfa, StdioFile(base + ".dot", "w")); } void nfaExecMcClellan8_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCCLELLAN_NFA_8); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - nfaExecMcClellan8_dumpText(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - nfaExecMcClellan8_dumpDot(nfa, f); - fclose(f); + nfaExecMcClellan8_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecMcClellan8_dumpDot(nfa, StdioFile(base + ".dot", "w")); } } // namespace ue2 diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp index 2049fee0..871ca4fb 100644 --- a/src/nfa/mcsheng_compile.cpp +++ b/src/nfa/mcsheng_compile.cpp @@ -45,13 +45,14 @@ #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include "util/unaligned.h" +#include "util/unordered.h" #include "util/verify_types.h" #include @@ -383,6 +384,8 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { #define MAX_SHENG_STATES 16 #define MAX_SHENG_LEAKINESS 0.05 +using LeakinessCache = ue2_unordered_map, double>; + /** * Returns the proportion of strings of length 'depth' which will leave the * sheng region when starting at state 'u'. 
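Editorial sketch (not part of the patch): the hunks above introduce is_dead() and teach clear_deeper_reports() to mark a report-free DFA as dead by pointing both start states at DEAD_STATE, while mcclellanCompile_i() now asserts it is never handed such a DFA. The stand-alone C++ below models that convention; raw_dfa_like, the u16 dstate_id_t, and DEAD_STATE == 0 are simplified stand-ins for the real declarations in src/nfa/rdfa.h, assumed here for illustration only.

// Sketch only: the DEAD_STATE convention behind the new is_dead() helper.
#include <cassert>
#include <cstdint>

using dstate_id_t = std::uint16_t;          // assumed width, stand-in for ue2's type
static constexpr dstate_id_t DEAD_STATE = 0;

struct raw_dfa_like {                       // hypothetical stand-in for ue2::raw_dfa
    dstate_id_t start_anchored = DEAD_STATE;
    dstate_id_t start_floating = DEAD_STATE;
};

static bool is_dead(const raw_dfa_like &rdfa) {
    // Dead means neither the anchored nor the floating start state is live.
    return rdfa.start_anchored == DEAD_STATE &&
           rdfa.start_floating == DEAD_STATE;
}

int main() {
    raw_dfa_like rdfa;
    assert(is_dead(rdfa));    // all reports stripped: nothing can match
    rdfa.start_anchored = 1;  // a live anchored start state revives it
    assert(!is_dead(rdfa));
    return 0;
}

A caller that strips reports, as clear_deeper_reports() now may, would check is_dead() before invoking the McClellan compiler, which asserts !is_dead(raw) on entry.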
@@ -390,8 +393,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { static double leakiness(const RdfaGraph &g, dfa_info &info, const flat_set &sheng_states, RdfaVertex u, - u32 depth, - unordered_map, double> &cache) { + u32 depth, LeakinessCache &cache) { double rv = 0; if (contains(cache, make_pair(u, depth))) { return cache[make_pair(u, depth)]; @@ -426,7 +428,7 @@ double leakiness(const RdfaGraph &g, dfa_info &info, static double leakiness(const RdfaGraph &g, dfa_info &info, const flat_set &sheng_states, RdfaVertex u) { - unordered_map, double> cache; + LeakinessCache cache; double rv = leakiness(g, info, sheng_states, u, 8, cache); return rv; } @@ -738,7 +740,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, assert(info.is_normal(currState.daddy)); u32 self_loop_width = 0; - const dstate curr_raw = info.states[curr_id]; + const dstate &curr_raw = info.states[curr_id]; for (unsigned i = 0; i < N_CHARS; i++) { if (curr_raw.next[info.alpha_remap[i]] == curr_id) { self_loop_width++; diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp index f5c058af..2b563079 100644 --- a/src/nfa/mcsheng_dump.cpp +++ b/src/nfa/mcsheng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -394,22 +394,14 @@ void dump_text_8(const NFA *nfa, FILE *f) { void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCSHENG_NFA_16); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - dump_text_16(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - dump_dot_16(nfa, f); - fclose(f); + dump_text_16(nfa, StdioFile(base + ".txt", "w")); + dump_dot_16(nfa, StdioFile(base + ".dot", "w")); } void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCSHENG_NFA_8); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - dump_text_8(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - dump_dot_8(nfa, f); - fclose(f); + dump_text_8(nfa, StdioFile(base + ".txt", "w")); + dump_dot_8(nfa, StdioFile(base + ".dot", "w")); } } // namespace ue2 diff --git a/src/nfa/mpv_dump.cpp b/src/nfa/mpv_dump.cpp index 9a8a4067..4979395d 100644 --- a/src/nfa/mpv_dump.cpp +++ b/src/nfa/mpv_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -132,7 +132,7 @@ void dumpCounter(FILE *f, const mpv_counter_info *c) { void nfaExecMpv_dump(const NFA *nfa, const string &base) { const mpv *m = (const mpv *)getImplNfa(nfa); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); fprintf(f, "Puff the Magic Engines\n"); fprintf(f, "\n"); @@ -154,7 +154,6 @@ void nfaExecMpv_dump(const NFA *nfa, const string &base) { } dumpTextReverse(nfa, f); - fclose(f); } } // namespace ue2 diff --git a/src/nfa/rdfa.cpp b/src/nfa/rdfa.cpp new file mode 100644 index 00000000..ae857b6a --- /dev/null +++ b/src/nfa/rdfa.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 
* Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rdfa.h" + +namespace ue2 { + +// prevent weak vtables +raw_dfa::~raw_dfa() {} + +void raw_dfa::stripExtraEodReports(void) { + /* if a state generates a given report as a normal accept - then it does + * not also need to generate an eod report for it */ + for (dstate &ds : states) { + for (const ReportID &report : ds.reports) { + ds.reports_eod.erase(report); + } + } +} + +bool raw_dfa::hasEodReports(void) const { + for (const dstate &ds : states) { + if (!ds.reports_eod.empty()) { + return true; + } + } + return false; +} + +} // namespace ue2 diff --git a/src/nfa/rdfa.h b/src/nfa/rdfa.h index fc60f177..6b994e4f 100644 --- a/src/nfa/rdfa.h +++ b/src/nfa/rdfa.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,7 +32,7 @@ #include "nfa_kind.h" #include "ue2common.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include #include @@ -81,7 +81,7 @@ struct raw_dfa { explicit raw_dfa(nfa_kind k) : kind(k) {} virtual ~raw_dfa(); - u16 getImplAlphaSize() const; + u16 getImplAlphaSize() const { return alpha_size - N_SPECIAL_SYMBOL; } virtual void stripExtraEodReports(void); bool hasEodReports(void) const; }; diff --git a/src/nfa/rdfa_merge.cpp b/src/nfa/rdfa_merge.cpp index 50e9b62a..2ad87123 100644 --- a/src/nfa/rdfa_merge.cpp +++ b/src/nfa/rdfa_merge.cpp @@ -36,9 +36,10 @@ #include "nfagraph/ng_mcclellan_internal.h" #include "util/container.h" #include "util/determinise.h" +#include "util/flat_containers.h" #include "util/make_unique.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include #include @@ -53,8 +54,8 @@ namespace { class Automaton_Merge { public: - typedef vector StateSet; - typedef ue2::unordered_map StateMap; + using StateSet = vector; + using StateMap = ue2_unordered_map; Automaton_Merge(const raw_dfa *rdfa1, const raw_dfa *rdfa2, const ReportManager *rm_in, const Grey &grey_in) @@ -289,7 +290,7 @@ unique_ptr mergeTwoDfas(const raw_dfa *d1, const raw_dfa 
*d2, auto rdfa = ue2::make_unique(d1->kind); Automaton_Merge autom(d1, d2, rm, grey); - if (!determinise(autom, rdfa->states, max_states)) { + if (determinise(autom, rdfa->states, max_states)) { rdfa->start_anchored = autom.start_anchored; rdfa->start_floating = autom.start_floating; rdfa->alpha_size = autom.alphasize; @@ -374,7 +375,7 @@ unique_ptr mergeAllDfas(const vector &dfas, DEBUG_PRINTF("merging dfa\n"); - if (determinise(n, rdfa->states, max_states)) { + if (!determinise(n, rdfa->states, max_states)) { DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states); return nullptr; /* over state limit */ } diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h index 9885cd16..2fe1e356 100644 --- a/src/nfa/shengcompile.h +++ b/src/nfa/shengcompile.h @@ -33,7 +33,10 @@ #include "rdfa.h" #include "util/bytecode_ptr.h" #include "util/charreach.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" + +#include +#include struct NFA; diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp index ce87beaf..99fda76f 100644 --- a/src/nfa/shengdump.cpp +++ b/src/nfa/shengdump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,7 +41,6 @@ #include "util/dump_util.h" #include "util/simd_types.h" - #ifndef DUMP_SUPPORT #error No dump support! #endif @@ -267,12 +266,8 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { void nfaExecSheng_dump(const NFA *nfa, const string &base) { assert(nfa->type == SHENG_NFA); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); - nfaExecSheng_dumpText(nfa, f); - fclose(f); - f = fopen_or_throw((base + ".dot").c_str(), "w"); - nfaExecSheng_dumpDot(nfa, f); - fclose(f); + nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w")); } } // namespace ue2 diff --git a/src/nfa/shufticompile.cpp b/src/nfa/shufticompile.cpp index 12a94b7b..f712ef94 100644 --- a/src/nfa/shufticompile.cpp +++ b/src/nfa/shufticompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ #include "ue2common.h" #include "util/charreach.h" #include "util/container.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include #include diff --git a/src/nfa/shufticompile.h b/src/nfa/shufticompile.h index a72904e0..59b9c38d 100644 --- a/src/nfa/shufticompile.h +++ b/src/nfa/shufticompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,7 @@ #include "ue2common.h" #include "util/charreach.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include diff --git a/src/nfa/tamarama_dump.cpp b/src/nfa/tamarama_dump.cpp index 88cb33cc..e6d34f7c 100644 --- a/src/nfa/tamarama_dump.cpp +++ b/src/nfa/tamarama_dump.cpp @@ -27,7 +27,7 @@ */ /** \file - * \brief Tamarama: container engine for exclusve engines, dump code. + * \brief Tamarama: container engine for exclusive engines, dump code. 
*/ #include "config.h" @@ -54,7 +54,7 @@ namespace ue2 { void nfaExecTamarama_dump(const struct NFA *nfa, const string &base) { const Tamarama *t = (const Tamarama *)getImplNfa(nfa); - FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); + StdioFile f(base + ".txt", "w"); fprintf(f, "Tamarama container engine\n"); fprintf(f, "\n"); @@ -63,7 +63,6 @@ void nfaExecTamarama_dump(const struct NFA *nfa, const string &base) { fprintf(f, "\n"); dumpTextReverse(nfa, f); fprintf(f, "\n"); - fclose(f); const u32 *subOffset = (const u32 *)((const char *)t + sizeof(struct Tamarama) + diff --git a/src/nfa/trufflecompile.cpp b/src/nfa/trufflecompile.cpp index 9442d046..f19de0ee 100644 --- a/src/nfa/trufflecompile.cpp +++ b/src/nfa/trufflecompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,12 +32,15 @@ * truffle is always able to represent an entire character class, providing a * backstop to other acceleration engines. */ + #include "trufflecompile.h" + #include "ue2common.h" #include "util/charreach.h" +#include "util/dump_mask.h" #include "util/simd_types.h" -#include "util/dump_mask.h" +#include using namespace std; diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index a5a5c235..a1304583 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -44,7 +44,6 @@ #include "util/graph.h" #include "util/noncopyable.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include #include diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index bfe73eb2..65574b50 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -220,6 +220,52 @@ vector findShellEdges(const NGHolder &g, return shell_edges; } +template +bool shellHasOnePath(const NGHolder &g, const flat_set &shell, + GetAdjRange adj_range_func) { + if (shell.empty()) { + DEBUG_PRINTF("no shell\n"); + return false; + } + + NFAVertex exit_vertex = NGHolder::null_vertex(); + for (auto u : shell) { + for (auto v : adj_range_func(u, g)) { + if (contains(shell, v)) { + continue; + } + if (!exit_vertex) { + exit_vertex = v; + continue; + } + if (exit_vertex == v) { + continue; + } + return false; + } + } + + return true; +} + +/** + * True if all edges out of vertices in the head shell lead to at most a single + * outside vertex, or the inverse for the tail shell. + */ +static +bool shellHasOnePath(const NGHolder &g, const flat_set &head_shell, + const flat_set &tail_shell) { + if (shellHasOnePath(g, head_shell, adjacent_vertices_range)) { + DEBUG_PRINTF("head shell has only one path through it\n"); + return true; + } + if (shellHasOnePath(g, tail_shell, inv_adjacent_vertices_range)) { + DEBUG_PRINTF("tail shell has only one path into it\n"); + return true; + } + return false; +} + /** * Common code called by calc- and recalc- below. Splits the given holder into * one or more connected components, adding them to the comps deque. @@ -250,16 +296,25 @@ void splitIntoComponents(unique_ptr g, return; } + // Find edges connecting the head and tail shells directly. 
vector shell_edges = findShellEdges(*g, head_shell, tail_shell); DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n", head_shell.size(), tail_shell.size(), shell_edges.size()); - ue2::unordered_map old2new; + // If there are no shell edges and only one path out of the head shell or + // into the tail shell, we aren't going to find more than one component. + if (shell_edges.empty() && shellHasOnePath(*g, head_shell, tail_shell)) { + DEBUG_PRINTF("single component\n"); + comps.push_back(std::move(g)); + return; + } + + unordered_map old2new; auto ug = createUnGraph(*g, true, true, old2new); // Construct reverse mapping. - ue2::unordered_map new2old; + unordered_map new2old; for (const auto &m : old2new) { new2old.emplace(m.second, m.first); } @@ -301,7 +356,7 @@ void splitIntoComponents(unique_ptr g, DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c); } - ue2::unordered_map v_map; // temp map for fillHolder + unordered_map v_map; // temp map for fillHolder for (auto &vv : verts) { // Shells are in every component. vv.insert(vv.end(), begin(head_shell), end(head_shell)); diff --git a/src/nfagraph/ng_cyclic_redundancy.cpp b/src/nfagraph/ng_cyclic_redundancy.cpp index 9ae4458c..c8d34687 100644 --- a/src/nfagraph/ng_cyclic_redundancy.cpp +++ b/src/nfagraph/ng_cyclic_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,9 +62,11 @@ #include "ng_prune.h" #include "ng_util.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/graph_small_color_map.h" +#include #include #include @@ -123,17 +125,17 @@ class SearchVisitor : public boost::default_dfs_visitor { } // namespace -template +template static bool searchForward(const Graph &g, const CharReach &reach, + ColorMap &colours, const flat_set &s, typename Graph::vertex_descriptor w) { - map colours; + colours.fill(small_color::white); try { - depth_first_visit(g, w, SearchVisitor(reach), - make_assoc_property_map(colours), - VertexInSet(s)); - } catch (SearchFailed&) { + depth_first_visit(g, w, SearchVisitor(reach), colours, + VertexInSet(s)); + } catch (SearchFailed &) { return false; } @@ -162,6 +164,9 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v, typedef typename Graph::vertex_descriptor vertex_descriptor; + // Colour map used for depth_first_visit(). + auto colours = make_small_color_map(g); + // precalc successors of v. flat_set succ_v; insert(&succ_v, adjacent_vertices(v, g)); @@ -200,7 +205,7 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v, DEBUG_PRINTF(" - checking w %zu\n", g[w].index); - if (!searchForward(g, reach, s, w)) { + if (!searchForward(g, reach, colours, s, w)) { continue; } @@ -234,6 +239,8 @@ bool cyclicPathRedundancyPass(Graph &g, NGHolder &raw) { } bool removeCyclicPathRedundancy(NGHolder &g) { + assert(hasCorrectlyNumberedVertices(g)); + // Forward pass. 
bool f_changed = cyclicPathRedundancyPass(g, g); if (f_changed) { diff --git a/src/nfagraph/ng_depth.cpp b/src/nfagraph/ng_depth.cpp index 67a6b27b..6c90326c 100644 --- a/src/nfagraph/ng_depth.cpp +++ b/src/nfagraph/ng_depth.cpp @@ -34,17 +34,18 @@ #include "ng_util.h" #include "ue2common.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include #include +#include #include #include -#include #include +#include #include #include -#include #include using namespace std; @@ -137,13 +138,15 @@ vector findLoopReachable(const Graph &g, EdgeSet deadEdges; BackEdges be(deadEdges); - depth_first_search(g, visitor(be).root_vertex(src)); + auto colors = make_small_color_map(g); + + depth_first_search(g, be, colors, src); auto af = make_bad_edge_filter(&deadEdges); auto acyclic_g = make_filtered_graph(g, af); vector topoOrder; /* actually reverse topological order */ topoOrder.reserve(deadNodes.size()); - topological_sort(acyclic_g, back_inserter(topoOrder)); + topological_sort(acyclic_g, back_inserter(topoOrder), color_map(colors)); for (const auto &e : deadEdges) { size_t srcIdx = g[source(e, g)].index; @@ -204,14 +207,16 @@ void calcDepthFromSource(const GraphT &g, visitor(make_bfs_visitor(record_distances( make_iterator_property_map(dMin.begin(), min_index_map), - boost::on_tree_edge())))); + boost::on_tree_edge()))) + .color_map(make_small_color_map(mindist_g))); auto max_index_map = get(vertex_index, maxdist_g); dag_shortest_paths(maxdist_g, srcVertex, distance_map(make_iterator_property_map(dMax.begin(), max_index_map)) - .weight_map(make_constant_property(-1))); + .weight_map(make_constant_property(-1)) + .color_map(make_small_color_map(maxdist_g))); for (size_t i = 0; i < numVerts; i++) { if (dMin[i] > DIST_UNREACHABLE) { diff --git a/src/nfagraph/ng_dominators.cpp b/src/nfagraph/ng_dominators.cpp index 50536b76..d6a064d1 100644 --- a/src/nfagraph/ng_dominators.cpp +++ b/src/nfagraph/ng_dominators.cpp @@ -36,7 +36,6 @@ #include "ue2common.h" #include "ng_holder.h" #include "ng_util.h" -#include "util/ue2_containers.h" #include // locally patched version #include diff --git a/src/nfagraph/ng_dominators.h b/src/nfagraph/ng_dominators.h index 81b7e037..f505b7e4 100644 --- a/src/nfagraph/ng_dominators.h +++ b/src/nfagraph/ng_dominators.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,15 +36,14 @@ #define NG_DOMINATORS_H #include "ng_holder.h" -#include "util/ue2_containers.h" + +#include namespace ue2 { -class NGHolder; +std::unordered_map findDominators(const NGHolder &g); -ue2::unordered_map findDominators(const NGHolder &g); - -ue2::unordered_map findPostDominators(const NGHolder &g); +std::unordered_map findPostDominators(const NGHolder &g); } // namespace ue2 diff --git a/src/nfagraph/ng_dump.cpp b/src/nfagraph/ng_dump.cpp index 094d2401..8777a750 100644 --- a/src/nfagraph/ng_dump.cpp +++ b/src/nfagraph/ng_dump.cpp @@ -51,6 +51,7 @@ #include "smallwrite/smallwrite_dump.h" #include "util/bitutils.h" #include "util/dump_charclass.h" +#include "util/dump_util.h" #include "util/report.h" #include "util/report_manager.h" #include "util/ue2string.h" @@ -175,7 +176,7 @@ public: : g(g_in), rm(&rm_in) {} NFAWriter(const GraphT &g_in, - const ue2::unordered_map ®ion_map_in) + const unordered_map ®ion_map_in) : g(g_in), region_map(®ion_map_in) {} void 
operator()(ostream& os, const VertexT& v) const { @@ -253,7 +254,7 @@ public: private: const GraphT &g; const ReportManager *rm = nullptr; - const ue2::unordered_map *region_map = nullptr; + const unordered_map *region_map = nullptr; }; } @@ -277,7 +278,7 @@ void dumpGraphImpl(const char *name, const GraphT &g, const ReportManager &rm) { template void dumpGraphImpl(const char *name, const GraphT &g, - const ue2::unordered_map ®ion_map) { + const unordered_map ®ion_map) { typedef typename boost::graph_traits::vertex_descriptor VertexT; typedef typename boost::graph_traits::edge_descriptor EdgeT; ofstream os(name); @@ -331,7 +332,7 @@ void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber, } void dumpHolderImpl(const NGHolder &h, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, unsigned int stageNumber, const char *stageName, const Grey &grey) { if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) { @@ -348,14 +349,7 @@ void dumpSmallWrite(const RoseEngine *rose, const Grey &grey) { } const struct SmallWriteEngine *smwr = getSmallWrite(rose); - - stringstream ss; - ss << grey.dumpPath << "smallwrite.txt"; - - FILE *f = fopen(ss.str().c_str(), "w"); - smwrDumpText(smwr, f); - fclose(f); - + smwrDumpText(smwr, StdioFile(grey.dumpPath + "smallwrite.txt", "w")); smwrDumpNFA(smwr, false, grey.dumpPath); } @@ -420,9 +414,7 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) { return; } - stringstream ss; - ss << grey.dumpPath << "internal_reports.txt"; - FILE *f = fopen(ss.str().c_str(), "w"); + StdioFile f(grey.dumpPath + "internal_reports.txt", "w"); const vector &reports = rm.reports(); for (size_t i = 0; i < reports.size(); i++) { const Report &report = reports[i]; @@ -461,7 +453,6 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) { } fprintf(f, "\n"); } - fclose(f); } } // namespace ue2 diff --git a/src/nfagraph/ng_dump.h b/src/nfagraph/ng_dump.h index 077f07ce..3e12d1d2 100644 --- a/src/nfagraph/ng_dump.h +++ b/src/nfagraph/ng_dump.h @@ -36,7 +36,8 @@ #include "grey.h" #include "ng_holder.h" // for graph types #include "ue2common.h" -#include "util/ue2_containers.h" + +#include #ifdef DUMP_SUPPORT #include @@ -75,7 +76,7 @@ void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber, // Variant that takes a region map as well. 
void dumpHolderImpl(const NGHolder &h, - const ue2::unordered_map ®ion_map, + const std::unordered_map ®ion_map, unsigned int stageNumber, const char *stageName, const Grey &grey); @@ -123,7 +124,7 @@ void dumpHolder(UNUSED const NGHolder &h, UNUSED unsigned int stageNumber, UNUSED static inline void dumpHolder(UNUSED const NGHolder &h, - UNUSED const ue2::unordered_map ®ion_map, + UNUSED const std::unordered_map ®ion_map, UNUSED unsigned int stageNumber, UNUSED const char *name, UNUSED const Grey &grey) { #ifdef DUMP_SUPPORT diff --git a/src/nfagraph/ng_edge_redundancy.cpp b/src/nfagraph/ng_edge_redundancy.cpp index 3ce62c41..b8354bd4 100644 --- a/src/nfagraph/ng_edge_redundancy.cpp +++ b/src/nfagraph/ng_edge_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,8 +38,8 @@ #include "parser/position.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" #include #include @@ -181,6 +181,28 @@ bool removeEdgeRedundancyNearCyclesFwd(NGHolder &g, bool ignore_starts) { return dead_count; } +static +bool checkReportsRev(const NGHolder &g, NFAVertex v, + const set &happy) { + if (g[v].reports.empty()) { + return true; + } + + assert(edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second); + + /* an edge to accept takes priority over eod only accept */ + NFAVertex accept = edge(v, g.accept, g).second ? g.accept : g.acceptEod; + + flat_set happy_reports; + for (NFAVertex u : happy) { + if (edge(u, accept, g).second) { + insert(&happy_reports, g[u].reports); + } + } + + return is_subset_of(g[v].reports, happy_reports); +} + /** \brief Redundant self-loop removal (reverse version). 
* * A self loop on a vertex v can be removed if: @@ -233,7 +255,8 @@ bool removeEdgeRedundancyNearCyclesRev(NGHolder &g) { happy.insert(u); } - if (!happy.empty() && checkVerticesRev(g, sad, happy)) { + if (!happy.empty() && checkVerticesRev(g, sad, happy) + && checkReportsRev(g, v, happy)) { dead_count++; remove_edge(v, v, g); } diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index 438e5ea8..a42a0ac7 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -37,9 +37,10 @@ #include "ng_holder.h" #include "ng_util.h" #include "util/compile_context.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include #include @@ -121,16 +122,9 @@ public: vertex_flags == b.vertex_flags && rs == b.rs; } - friend size_t hash_value(const ClassInfo &c) { - size_t val = 0; - boost::hash_combine(val, c.rs); - boost::hash_combine(val, c.vertex_flags); - boost::hash_combine(val, c.cr); - boost::hash_combine(val, c.adjacent_cr); - boost::hash_combine(val, c.node_type); - boost::hash_combine(val, c.depth.d1); - boost::hash_combine(val, c.depth.d2); - return val; + size_t hash() const { + return hash_all(rs, vertex_flags, cr, adjacent_cr, node_type, depth.d1, + depth.d2); } private: @@ -319,7 +313,7 @@ vector partitionGraph(vector> &infos, const size_t num_verts = infos.size(); vector classes; - unordered_map classinfomap; + ue2_unordered_map classinfomap; // assume we will have lots of classes, so we don't waste time resizing // these structures. diff --git a/src/nfagraph/ng_execute.h b/src/nfagraph/ng_execute.h index bdcfecfd..32f5520d 100644 --- a/src/nfagraph/ng_execute.h +++ b/src/nfagraph/ng_execute.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,7 @@ #define NG_EXECUTE_H #include "ng_holder.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include diff --git a/src/nfagraph/ng_fixed_width.cpp b/src/nfagraph/ng_fixed_width.cpp index 978dad44..8fb264d8 100644 --- a/src/nfagraph/ng_fixed_width.cpp +++ b/src/nfagraph/ng_fixed_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,7 @@ namespace ue2 { static bool findMask(const NGHolder &g, vector *mask, bool *anchored, - ue2::flat_set *reports) { + flat_set *reports) { DEBUG_PRINTF("looking for a mask pattern\n"); set s_succ; insert(&s_succ, adjacent_vertices(g.start, g)); @@ -117,7 +117,7 @@ bool handleFixedWidth(RoseBuild &rose, const NGHolder &g, const Grey &grey) { return false; } - ue2::flat_set reports; + flat_set reports; bool anchored = false; vector mask; diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index e4be14c3..992faf7c 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,10 +40,12 @@ #include 
"util/bitfield.h" #include "util/container.h" #include "util/determinise.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/hash_dynamic_bitset.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include #include @@ -236,7 +238,7 @@ public: struct Big_Traits { using StateSet = dynamic_bitset<>; - using StateMap = map; + using StateMap = unordered_map; static StateSet init_states(u32 num) { return StateSet(num); @@ -257,7 +259,7 @@ public: struct Graph_Traits { using StateSet = bitfield; - using StateMap = ue2::unordered_map; + using StateMap = unordered_map; static StateSet init_states(UNUSED u32 num) { assert(num <= NFA_STATE_LIMIT); @@ -284,8 +286,8 @@ public: class Automaton_Haig_Merge { public: - typedef vector StateSet; - typedef ue2::unordered_map StateMap; + using StateSet = vector; + using StateMap = ue2_unordered_map; explicit Automaton_Haig_Merge(const vector &in) : nfas(in.begin(), in.end()), dead(in.size()) { @@ -514,11 +516,11 @@ bool doHaig(const NGHolder &g, som_type som, raw_som_dfa *rdfa) { u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from a fight */ - typedef typename Auto::StateSet StateSet; + using StateSet = typename Auto::StateSet; vector nfa_state_map; Auto n(g, som, triggers, unordered_som); try { - if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) { + if (!determinise(n, rdfa->states, state_limit, &nfa_state_map)) { DEBUG_PRINTF("state limit exceeded\n"); return false; } @@ -720,15 +722,14 @@ unique_ptr attemptToMergeHaig(const vector &df } } - typedef Automaton_Haig_Merge::StateSet StateSet; + using StateSet = Automaton_Haig_Merge::StateSet; vector nfa_state_map; auto rdfa = ue2::make_unique(dfas[0]->kind, unordered_som, NODE_START, dfas[0]->stream_som_loc_width); - int rv = determinise(n, rdfa->states, limit, &nfa_state_map); - if (rv) { - DEBUG_PRINTF("%d:state limit (%u) exceeded\n", rv, limit); + if (!determinise(n, rdfa->states, limit, &nfa_state_map)) { + DEBUG_PRINTF("state limit (%u) exceeded\n", limit); return nullptr; /* over state limit */ } diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h index fbb6ac52..36cf6244 100644 --- a/src/nfagraph/ng_holder.h +++ b/src/nfagraph/ng_holder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,7 +40,7 @@ #include "ue2common.h" #include "nfa/nfa_kind.h" #include "util/charreach.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/ue2_graph.h" namespace ue2 { @@ -67,7 +67,7 @@ struct NFAGraphEdgeProps { /** \brief For graphs that will be implemented as multi-top engines, this * specifies the top events. Only used on edges from the start vertex. */ - ue2::flat_set tops; + flat_set tops; /** \brief Flags associated with assertions. 
*/ u32 assert_flags = 0; diff --git a/src/nfagraph/ng_is_equal.cpp b/src/nfagraph/ng_is_equal.cpp index 2df79f50..35a09d0e 100644 --- a/src/nfagraph/ng_is_equal.cpp +++ b/src/nfagraph/ng_is_equal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,13 +39,9 @@ #include "ng_util.h" #include "ue2common.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" - -#include - -#include using namespace std; @@ -200,11 +196,11 @@ u64a hash_holder(const NGHolder &g) { size_t rv = 0; for (auto v : vertices_range(g)) { - boost::hash_combine(rv, g[v].index); - boost::hash_combine(rv, g[v].char_reach); + hash_combine(rv, g[v].index); + hash_combine(rv, g[v].char_reach); for (auto w : adjacent_vertices_range(v, g)) { - boost::hash_combine(rv, g[w].index); + hash_combine(rv, g[w].index); } } diff --git a/src/nfagraph/ng_lbr.cpp b/src/nfagraph/ng_lbr.cpp index 9bf16efe..d8ba503c 100644 --- a/src/nfagraph/ng_lbr.cpp +++ b/src/nfagraph/ng_lbr.cpp @@ -346,24 +346,4 @@ bytecode_ptr constructLBR(const NGHolder &g, return constructLBR(proto, triggers, cc, rm); } -/** \brief True if graph \p g could be turned into an LBR engine. */ -bool isLBR(const NGHolder &g, const Grey &grey) { - if (!grey.allowLbr) { - return false; - } - - PureRepeat repeat; - if (!isPureRepeat(g, repeat)) { - DEBUG_PRINTF("not pure bounded repeat\n"); - return false; - } - - if (repeat.reports.size() != 1) { - DEBUG_PRINTF("too many reports\n"); - return false; - } - - return true; -} - } // namespace ue2 diff --git a/src/nfagraph/ng_lbr.h b/src/nfagraph/ng_lbr.h index 1eec9653..c181dbb9 100644 --- a/src/nfagraph/ng_lbr.h +++ b/src/nfagraph/ng_lbr.h @@ -66,9 +66,6 @@ constructLBR(const CastleProto &proto, const std::vector> &triggers, const CompileContext &cc, const ReportManager &rm); -/** \brief True if graph \p g could be turned into an LBR engine. */ -bool isLBR(const NGHolder &g, const Grey &grey); - } // namespace ue2 #endif // NG_LBR_H diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 283bba22..922100e7 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -53,11 +53,13 @@ #include "util/container.h" #include "util/graph_range.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/verify_types.h" #include #include +#include +#include #include #include @@ -73,8 +75,8 @@ namespace ue2 { // Only used in assertions. static bool sanityCheckGraph(const NGHolder &g, - const ue2::unordered_map &state_ids) { - ue2::unordered_set seen_states; + const unordered_map &state_ids) { + unordered_set seen_states; for (auto v : vertices_range(g)) { // Non-specials should have non-empty reachability. 
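Editorial sketch (not part of the patch): several hunks in this region, ClassInfo::hash() in ng_equivalence.cpp, hash_dfa_no_reports() earlier, and hash_holder() in ng_is_equal.cpp, drop boost::hash_combine in favour of the hash_combine/hash_all helpers from util/hash.h. The self-contained C++ below is a rough illustration of that idiom; the Boost-style mixing constant and the use of std::hash are assumptions for the sketch, not the exact ue2 implementation.

// Sketch only: a hash_combine/hash_all style field combiner (requires C++17).
#include <cstddef>
#include <functional>
#include <iostream>

template <typename T>
static void hash_combine(std::size_t &seed, const T &value) {
    // Conventional "seed ^= hash + magic + shifts" mixer, as in Boost.
    seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

template <typename... Args>
static std::size_t hash_all(const Args &... args) {
    std::size_t seed = 0;
    (hash_combine(seed, args), ...);   // fold every field into one seed
    return seed;
}

int main() {
    // e.g. combining the fields of a ClassInfo-like record into one value
    std::cout << hash_all(42u, 7u, std::size_t{3}) << '\n';
    return 0;
}

Folding every member through a single combiner is what lets ClassInfo::hash() collapse to one hash_all(rs, vertex_flags, cr, adjacent_cr, node_type, depth.d1, depth.d2) call in the hunk above.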
@@ -115,10 +117,9 @@ bool sanityCheckGraph(const NGHolder &g, #endif static -void findSquashStates(const NGHolder &g, - const vector &repeats, - map &squashMap) { - squashMap = findSquashers(g); +unordered_map findSquashStates(const NGHolder &g, + const vector &repeats) { + auto squashMap = findSquashers(g); filterSquashers(g, squashMap); /* We also filter out the cyclic states representing bounded repeats, as @@ -128,6 +129,8 @@ void findSquashStates(const NGHolder &g, squashMap.erase(br.cyclic); } } + + return squashMap; } /** @@ -468,7 +471,7 @@ void makeTopStates(NGHolder &g, map> &tops_out, static set findZombies(const NGHolder &h, const map &br_cyclic, - const ue2::unordered_map &state_ids, + const unordered_map &state_ids, const CompileContext &cc) { set zombies; if (!cc.grey.allowZombies) { @@ -516,7 +519,7 @@ set findZombies(const NGHolder &h, } static -void reverseStateOrdering(ue2::unordered_map &state_ids) { +void reverseStateOrdering(unordered_map &state_ids) { vector ordering; for (auto &e : state_ids) { if (e.second == NO_STATE) { @@ -569,7 +572,7 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, bool impl_test_only, const CompileContext &cc, - ue2::unordered_map &state_ids, + unordered_map &state_ids, vector &repeats, map> &tops) { assert(is_triggered(h_in) || fixed_depth_tops.empty()); @@ -637,7 +640,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, assert(rm); } - ue2::unordered_map state_ids; + unordered_map state_ids; vector repeats; map> tops; unique_ptr h @@ -657,12 +660,12 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, br_cyclic[br.cyclic] = BoundedRepeatSummary(br.repeatMin, br.repeatMax); } - map reportSquashMap; - map squashMap; + unordered_map reportSquashMap; + unordered_map squashMap; // build map of squashed and squashers if (cc.grey.squashNFA) { - findSquashStates(*h, repeats, squashMap); + squashMap = findSquashStates(*h, repeats); if (rm && cc.grey.highlanderSquash) { reportSquashMap = findHighlanderSquashers(*h, *rm); @@ -734,8 +737,8 @@ bytecode_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, map> tops; /* only the standards tops for nfas */ set zombies; vector repeats; - map reportSquashMap; - map squashMap; + unordered_map reportSquashMap; + unordered_map squashMap; return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops, zombies, false, false, hint, cc); @@ -785,7 +788,7 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, * resultant NGHolder has <= NFA_MAX_STATES. If it does, we know we can * implement it as an NFA. */ - ue2::unordered_map state_ids; + unordered_map state_ids; vector repeats; map> tops; unique_ptr h @@ -832,7 +835,7 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm, const map fixed_depth_tops; // empty const map>> triggers; // empty - ue2::unordered_map state_ids; + unordered_map state_ids; vector repeats; map> tops; unique_ptr h @@ -848,8 +851,8 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm, // Should have no bearing on accel calculation, so we leave these empty. 
const set zombies; - const map reportSquashMap; - const map squashMap; + unordered_map reportSquashMap; + unordered_map squashMap; return countAccelStates(*h, state_ids, repeats, reportSquashMap, squashMap, tops, zombies, cc); diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index 80e08a7f..fa46a42c 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -335,7 +335,7 @@ struct DAccelScheme { return false; } - ue2::flat_set > double_byte; + flat_set> double_byte; CharReach double_cr; u32 double_offset = 0; }; diff --git a/src/nfagraph/ng_limex_accel.h b/src/nfagraph/ng_limex_accel.h index f0c98db2..f6f7f1b3 100644 --- a/src/nfagraph/ng_limex_accel.h +++ b/src/nfagraph/ng_limex_accel.h @@ -39,8 +39,8 @@ #include "nfa/accelcompile.h" #include "util/accel_scheme.h" #include "util/charreach.h" +#include "util/flat_containers.h" #include "util/order_check.h" -#include "util/ue2_containers.h" #include #include @@ -57,8 +57,7 @@ struct CompileContext; void findAccelFriends(const NGHolder &g, NFAVertex v, const std::map &br_cyclic, - u32 offset, - ue2::flat_set *friends); + u32 offset, flat_set *friends); #define DOUBLE_SHUFTI_LIMIT 20 diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index a6664b07..ea0def02 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -40,6 +40,7 @@ #include "util/depth.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/ue2_graph.h" #include "util/ue2string.h" @@ -462,17 +463,13 @@ next_literal: #ifdef DEBUG static UNUSED -const char *describeColor(boost::default_color_type c) { +const char *describeColor(small_color c) { switch (c) { - case boost::white_color: + case small_color::white: return "white"; - case boost::gray_color: + case small_color::gray: return "gray"; - case boost::green_color: - return "green"; - case boost::red_color: - return "red"; - case boost::black_color: + case small_color::black: return "black"; default: return "unknown"; @@ -488,12 +485,14 @@ const char *describeColor(boost::default_color_type c) { */ static vector add_reverse_edges_and_index(LitGraph &lg) { + const size_t edge_count = num_edges(lg); vector fwd_edges; + fwd_edges.reserve(edge_count); for (const auto &e : edges_range(lg)) { fwd_edges.push_back(e); } - vector rev_map(2 * num_edges(lg)); + vector rev_map(2 * edge_count); for (const auto &e : fwd_edges) { LitVertex u = source(e, lg); @@ -525,7 +524,7 @@ void findMinCut(LitGraph &lg, vector &cutset) { const auto v_index_map = get(&LitGraphVertexProps::index, lg); const auto e_index_map = get(&LitGraphEdgeProps::index, lg); const size_t num_verts = num_vertices(lg); - vector colors(num_verts); + auto colors = make_small_color_map(lg); vector distances(num_verts); vector predecessors(num_verts); vector residuals(num_edges(lg)); @@ -535,7 +534,7 @@ void findMinCut(LitGraph &lg, vector &cutset) { make_iterator_property_map(residuals.begin(), e_index_map), make_iterator_property_map(rev_edges.begin(), e_index_map), make_iterator_property_map(predecessors.begin(), v_index_map), - make_iterator_property_map(colors.begin(), v_index_map), + colors, make_iterator_property_map(distances.begin(), v_index_map), v_index_map, lg.root, lg.sink); DEBUG_PRINTF("done, flow = %llu\n", flow); @@ -550,19 +549,19 @@ void findMinCut(LitGraph &lg, vector &cutset) { for (const auto &e : edges_range(lg)) { const LitVertex u = source(e, lg), v = target(e, 
lg); - const auto ucolor = colors[lg[u].index]; - const auto vcolor = colors[lg[v].index]; + const auto ucolor = get(colors, u); + const auto vcolor = get(colors, v); DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index, describeColor(ucolor), lg[v].index, describeColor(vcolor), lg[e].score); - if (ucolor != boost::white_color && vcolor == boost::white_color) { + if (ucolor != small_color::white && vcolor == small_color::white) { assert(v != lg.sink); white_cut.push_back(e); white_flow += lg[e].score; } - if (ucolor == boost::black_color && vcolor != boost::black_color) { + if (ucolor == small_color::black && vcolor != small_color::black) { assert(v != lg.sink); black_cut.push_back(e); black_flow += lg[e].score; @@ -812,7 +811,7 @@ bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, } assert(u != g.startDs); - ue2::unordered_map rhs_map; + unordered_map rhs_map; vector pivots = make_vector_from(adjacent_vertices(u, g)); splitRHS(g, pivots, rhs, &rhs_map); diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp index de05e490..4d3965df 100644 --- a/src/nfagraph/ng_literal_component.cpp +++ b/src/nfagraph/ng_literal_component.cpp @@ -45,6 +45,8 @@ #include "util/graph_range.h" #include "util/ue2string.h" +#include + using namespace std; namespace ue2 { @@ -196,7 +198,7 @@ bool splitOffLiterals(NG &ng, NGHolder &g) { bool changed = false; set dead; - ue2::unordered_set unanchored; // for faster lookup. + unordered_set unanchored; // for faster lookup. insert(&unanchored, adjacent_vertices(g.startDs, g)); // Anchored literals. diff --git a/src/nfagraph/ng_literal_decorated.cpp b/src/nfagraph/ng_literal_decorated.cpp index 89c01a6c..61a31dbf 100644 --- a/src/nfagraph/ng_literal_decorated.cpp +++ b/src/nfagraph/ng_literal_decorated.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,8 +45,6 @@ #include #include -#include - using namespace std; namespace ue2 { @@ -194,7 +192,7 @@ struct PathMask { } vector mask; - ue2::flat_set reports; + flat_set reports; bool is_anchored; bool is_eod; }; @@ -210,6 +208,11 @@ bool handleDecoratedLiterals(RoseBuild &rose, const NGHolder &g, return false; } + if (!hasNarrowReachVertex(g)) { + DEBUG_PRINTF("no narrow reach vertices\n"); + return false; + } + if (hasLargeDegreeVertex(g)) { DEBUG_PRINTF("large degree\n"); return false; diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 9448a0bf..091b89b8 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -41,17 +41,18 @@ #include "ue2common.h" #include "util/bitfield.h" #include "util/determinise.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/hash.h" #include "util/hash_dynamic_bitset.h" #include "util/make_unique.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include #include #include #include +#include #include #include @@ -283,10 +284,8 @@ static bool triggerAllowed(const NGHolder &g, const NFAVertex v, const vector > &all_triggers, const vector &trigger) { - set curr; - set next; - - curr.insert(v); + flat_set curr({v}); + flat_set next; for (auto it = trigger.rbegin(); it != trigger.rend(); ++it) { next.clear(); @@ -433,6 +432,7 @@ public: } return allExternalReports(*rm, test_reports); } + private: const 
ReportManager *rm; public: @@ -484,7 +484,7 @@ public: struct Graph_Traits { using StateSet = bitfield; - using StateMap = ue2::unordered_map; + using StateMap = unordered_map; static StateSet init_states(UNUSED u32 num) { assert(num <= NFA_STATE_LIMIT); @@ -559,16 +559,21 @@ unique_ptr buildMcClellan(const NGHolder &graph, = (graph.kind == NFA_OUTFIX || finalChance) ? FINAL_DFA_STATE_LIMIT : DFA_STATE_LIMIT; - unique_ptr rdfa = ue2::make_unique(graph.kind); - const u32 numStates = num_vertices(graph); DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates); + if (numStates > FINAL_DFA_STATE_LIMIT) { + DEBUG_PRINTF("rejecting nfa as too many vertices\n"); + return nullptr; + } + + auto rdfa = ue2::make_unique(graph.kind); + if (numStates <= NFA_STATE_LIMIT) { /* Fast path. Automaton_Graph uses a bitfield internally to represent * states and is quicker than Automaton_Big. */ Automaton_Graph n(rm, graph, single_trigger, triggers, prunable); - if (determinise(n, rdfa->states, state_limit)) { + if (!determinise(n, rdfa->states, state_limit)) { DEBUG_PRINTF("state limit exceeded\n"); return nullptr; /* over state limit */ } @@ -580,7 +585,7 @@ unique_ptr buildMcClellan(const NGHolder &graph, } else { /* Slow path. Too many states to use Automaton_Graph. */ Automaton_Big n(rm, graph, single_trigger, triggers, prunable); - if (determinise(n, rdfa->states, state_limit)) { + if (!determinise(n, rdfa->states, state_limit)) { DEBUG_PRINTF("state limit exceeded\n"); return nullptr; /* over state limit */ } diff --git a/src/nfagraph/ng_mcclellan_internal.h b/src/nfagraph/ng_mcclellan_internal.h index b78dac3b..f069d733 100644 --- a/src/nfagraph/ng_mcclellan_internal.h +++ b/src/nfagraph/ng_mcclellan_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +38,7 @@ #include "nfagraph/ng_holder.h" #include "util/charreach.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include diff --git a/src/nfagraph/ng_misc_opt.cpp b/src/nfagraph/ng_misc_opt.cpp index 29939fec..8aaaf99f 100644 --- a/src/nfagraph/ng_misc_opt.cpp +++ b/src/nfagraph/ng_misc_opt.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -69,9 +69,11 @@ #include "util/charreach.h" #include "util/container.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/graph_small_color_map.h" +#include "util/flat_containers.h" #include "ue2common.h" +#include #include #include @@ -549,14 +551,29 @@ bool mergeCyclicDotStars(NGHolder &g) { return true; } +struct PrunePathsInfo { + explicit PrunePathsInfo(const NGHolder &g) + : color_map(make_small_color_map(g)), bad(num_vertices(g)) {} + + void clear() { + no_explore.clear(); + color_map.fill(small_color::white); + bad.reset(); + } + + flat_set no_explore; + using color_map_type = decltype(make_small_color_map(NGHolder())); + color_map_type color_map; + boost::dynamic_bitset<> bad; +}; + /** - * Returns the set of vertices that cannot be on if v is not on. + * Finds the set of vertices that cannot be on if v is not on, setting their + * indices in bitset PrunePathsInfo::bad. 
*/ static -flat_set findDependentVertices(const NGHolder &g, NFAVertex v) { - auto v_pred = preds(v, g); - flat_set may_be_on; - +void findDependentVertices(const NGHolder &g, PrunePathsInfo &info, + NFAVertex v) { /* We need to exclude any vertex that may be reached on a path which is * incompatible with the vertex v being on. */ @@ -570,38 +587,28 @@ flat_set findDependentVertices(const NGHolder &g, NFAVertex v) { * check down edges. Alternately can just filter these edges out of the * graph first. */ - flat_set no_explore; for (NFAVertex t : adjacent_vertices_range(v, g)) { for (NFAEdge e : in_edges_range(t, g)) { NFAVertex s = source(e, g); if (edge(s, v, g).second) { - no_explore.insert(e); + info.no_explore.insert(e); } } } - auto filtered_g = make_filtered_graph(g, make_bad_edge_filter(&no_explore)); + auto filtered_g = + make_filtered_graph(g, make_bad_edge_filter(&info.no_explore)); + + // We use a bitset to track bad vertices, rather than filling a (potentially + // very large) set structure. + auto recorder = make_vertex_index_bitset_recorder(info.bad); - vector color_raw(num_vertices(g)); - auto color = make_iterator_property_map(color_raw.begin(), - get(vertex_index, g)); - flat_set bad; for (NFAVertex b : vertices_range(g)) { if (b != g.start && g[b].char_reach.isSubsetOf(g[v].char_reach)) { continue; } - boost::depth_first_visit(filtered_g, b, make_vertex_recorder(bad), - color); + boost::depth_first_visit(filtered_g, b, recorder, info.color_map); } - - flat_set rv; - for (NFAVertex u : vertices_range(g)) { - if (!contains(bad, u)) { - DEBUG_PRINTF("%zu is good\n", g[u].index); - rv.insert(u); - } - } - return rv; } static @@ -617,14 +624,16 @@ bool sometimesEnabledConcurrently(NFAVertex main_cyclic, NFAVertex v, } static -bool pruneUsingSuccessors(NGHolder &g, NFAVertex u, som_type som) { +bool pruneUsingSuccessors(NGHolder &g, PrunePathsInfo &info, NFAVertex u, + som_type som) { if (som && (is_virtual_start(u, g) || u == g.startDs)) { return false; } bool changed = false; DEBUG_PRINTF("using cyclic %zu as base\n", g[u].index); - auto children = findDependentVertices(g, u); + info.clear(); + findDependentVertices(g, info, u); vector u_succs; for (NFAVertex v : adjacent_vertices_range(u, g)) { if (som && is_virtual_start(v, g)) { @@ -634,22 +643,25 @@ bool pruneUsingSuccessors(NGHolder &g, NFAVertex u, som_type som) { } u_succs.push_back(v); } + stable_sort(u_succs.begin(), u_succs.end(), [&](NFAVertex a, NFAVertex b) { return g[a].char_reach.count() > g[b].char_reach.count(); }); + + flat_set dead; + for (NFAVertex v : u_succs) { DEBUG_PRINTF(" using %zu as killer\n", g[v].index); /* Need to distinguish between vertices that are switched on after the * cyclic vs vertices that are switched on concurrently with the cyclic * if (subject to a suitable reach) */ bool v_peer_of_cyclic = willBeEnabledConcurrently(u, v, g); - set dead; for (NFAVertex s : adjacent_vertices_range(v, g)) { DEBUG_PRINTF(" looking at preds of %zu\n", g[s].index); for (NFAEdge e : in_edges_range(s, g)) { NFAVertex p = source(e, g); - if (!contains(children, p) || p == v || p == u + if (info.bad.test(g[p].index) || p == v || p == u || p == g.accept) { DEBUG_PRINTF("%zu not a cand\n", g[p].index); continue; @@ -687,6 +699,7 @@ bool pruneUsingSuccessors(NGHolder &g, NFAVertex u, som_type som) { } } remove_edges(dead, g); + dead.clear(); } DEBUG_PRINTF("changed %d\n", (int)changed); @@ -696,9 +709,11 @@ bool pruneUsingSuccessors(NGHolder &g, NFAVertex u, som_type som) { bool 
prunePathsRedundantWithSuccessorOfCyclics(NGHolder &g, som_type som) { /* TODO: the reverse form of this is also possible */ bool changed = false; + PrunePathsInfo info(g); + for (NFAVertex v : vertices_range(g)) { if (hasSelfLoop(v, g) && g[v].char_reach.all()) { - changed |= pruneUsingSuccessors(g, v, som); + changed |= pruneUsingSuccessors(g, info, v, som); } } diff --git a/src/nfagraph/ng_netflow.cpp b/src/nfagraph/ng_netflow.cpp index cff26358..780a319f 100644 --- a/src/nfagraph/ng_netflow.cpp +++ b/src/nfagraph/ng_netflow.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ #include "ue2common.h" #include "util/container.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include #include @@ -118,7 +119,7 @@ void removeEdgesFromIndex(NGHolder &g, vector &capacityMap, u32 idx) { * colour map (from which we can find the min cut). */ static u64a getMaxFlow(NGHolder &h, const vector &capacityMap_in, - vector &colorMap) { + decltype(make_small_color_map(NGHolder())) &colorMap) { vector capacityMap = capacityMap_in; NFAVertex src = h.start; NFAVertex sink = h.acceptEod; @@ -141,7 +142,6 @@ u64a getMaxFlow(NGHolder &h, const vector &capacityMap_in, vector edgeResiduals(numTotalEdges); vector predecessors(numVertices); vector distances(numVertices); - assert(colorMap.size() == numVertices); auto v_index_map = get(vertex_index, h); auto e_index_map = get(edge_index, h); @@ -151,7 +151,7 @@ u64a getMaxFlow(NGHolder &h, const vector &capacityMap_in, make_iterator_property_map(edgeResiduals.begin(), e_index_map), make_iterator_property_map(reverseEdges.begin(), e_index_map), make_iterator_property_map(predecessors.begin(), v_index_map), - make_iterator_property_map(colorMap.begin(), v_index_map), + colorMap, make_iterator_property_map(distances.begin(), v_index_map), v_index_map, src, sink); @@ -169,8 +169,8 @@ vector findMinCut(NGHolder &h, const vector &scores) { assert(hasCorrectlyNumberedEdges(h)); assert(hasCorrectlyNumberedVertices(h)); - vector colorMap(num_vertices(h)); - u64a flow = getMaxFlow(h, scores, colorMap); + auto colors = make_small_color_map(h); + u64a flow = getMaxFlow(h, scores, colors); vector picked_white; vector picked_black; @@ -185,17 +185,17 @@ vector findMinCut(NGHolder &h, const vector &scores) { continue; // skips, among other things, reverse edges } - default_color_type fromColor = colorMap[h[from].index]; - default_color_type toColor = colorMap[h[to].index]; + auto fromColor = get(colors, from); + auto toColor = get(colors, to); - if (fromColor != boost::white_color && toColor == boost::white_color) { + if (fromColor != small_color::white && toColor == small_color::white) { assert(ec <= INVALID_EDGE_CAP); DEBUG_PRINTF("found white cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); observed_white_flow += ec; picked_white.push_back(e); } - if (fromColor == boost::black_color && toColor != boost::black_color) { + if (fromColor == small_color::black && toColor != small_color::black) { assert(ec <= INVALID_EDGE_CAP); DEBUG_PRINTF("found black cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); @@ -206,7 +206,7 @@ vector findMinCut(NGHolder &h, const vector &scores) { DEBUG_PRINTF("min flow = %llu b flow = %llu w flow %llu\n", flow, observed_black_flow, observed_white_flow); - if 
(MIN(observed_white_flow, observed_black_flow) != flow) { + if (min(observed_white_flow, observed_black_flow) != flow) { DEBUG_PRINTF("bad cut\n"); } diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index 3cd9d06d..04611872 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -55,10 +55,11 @@ #include "util/compile_context.h" #include "util/container.h" #include "util/dump_charclass.h" -#include "util/ue2_containers.h" #include "util/graph_range.h" #include +#include +#include #include @@ -127,10 +128,10 @@ struct RegionInfoQueueComp { static void findWidths(const NGHolder &g, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, RegionInfo &ri) { NGHolder rg; - ue2::unordered_map mapping; + unordered_map mapping; fillHolder(&rg, g, ri.vertices, &mapping); // Wire our entries to start and our exits to accept. @@ -155,7 +156,7 @@ void findWidths(const NGHolder &g, // acc can be either h.accept or h.acceptEod. static void markBoundaryRegions(const NGHolder &h, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, map ®ions, NFAVertex acc) { for (auto v : inv_adjacent_vertices_range(acc, h)) { if (is_special(v, h)) { @@ -174,7 +175,7 @@ void markBoundaryRegions(const NGHolder &h, static map findRegionInfo(const NGHolder &h, - const ue2::unordered_map ®ion_map) { + const unordered_map ®ion_map) { map regions; for (auto v : vertices_range(h)) { if (is_special(v, h)) { @@ -213,27 +214,17 @@ map findRegionInfo(const NGHolder &h, } static -void copyInEdges(NGHolder &g, NFAVertex from, NFAVertex to, - const ue2::unordered_set &rverts) { +void copyInEdges(NGHolder &g, NFAVertex from, NFAVertex to) { for (const auto &e : in_edges_range(from, g)) { NFAVertex u = source(e, g); - if (contains(rverts, u)) { - continue; - } - add_edge_if_not_present(u, to, g[e], g); } } static -void copyOutEdges(NGHolder &g, NFAVertex from, NFAVertex to, - const ue2::unordered_set &rverts) { +void copyOutEdges(NGHolder &g, NFAVertex from, NFAVertex to) { for (const auto &e : out_edges_range(from, g)) { NFAVertex t = target(e, g); - if (contains(rverts, t)) { - continue; - } - add_edge_if_not_present(to, t, g[e], g); if (is_any_accept(t, g)) { @@ -243,6 +234,21 @@ void copyOutEdges(NGHolder &g, NFAVertex from, NFAVertex to, } } +static +void removeInteriorEdges(NGHolder &g, const RegionInfo &ri) { + // Set of vertices in region, for quick lookups. + const unordered_set rverts(ri.vertices.begin(), + ri.vertices.end()); + + auto is_interior_in_edge = [&](const NFAEdge &e) { + return contains(rverts, source(e, g)); + }; + + for (auto v : ri.vertices) { + remove_in_edge_if(v, is_interior_in_edge, g); + } +} + static void replaceRegion(NGHolder &g, const RegionInfo &ri, size_t *verticesAdded, size_t *verticesRemoved) { @@ -284,19 +290,17 @@ void replaceRegion(NGHolder &g, const RegionInfo &ri, add_edge(verts.back(), verts.back(), g); } - // Set of vertices in region, for quick lookups. 
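The removeInteriorEdges helper added in the hunk above strips every edge that stays inside the region in a single pass, so copyInEdges/copyOutEdges no longer need the rverts filter. Below is a rough standalone sketch of that pattern on a plain BGL bidirectional adjacency_list; the graph type and names are illustrative stand-ins rather than Hyperscan's NGHolder machinery.

    #include <boost/graph/adjacency_list.hpp>
    #include <unordered_set>
    #include <vector>

    using Graph = boost::adjacency_list<boost::listS, boost::vecS,
                                        boost::bidirectionalS>;
    using Vertex = boost::graph_traits<Graph>::vertex_descriptor;
    using Edge = boost::graph_traits<Graph>::edge_descriptor;

    // Remove every in-edge whose source also lies inside the region, leaving
    // only the edges that enter the region from outside. remove_in_edge_if
    // requires a bidirectional graph.
    void removeInteriorEdges(Graph &g, const std::vector<Vertex> &region) {
        const std::unordered_set<Vertex> rverts(region.begin(), region.end());

        auto is_interior_in_edge = [&](const Edge &e) {
            return rverts.count(source(e, g)) != 0;
        };

        for (Vertex v : region) {
            remove_in_edge_if(v, is_interior_in_edge, g);
        }
    }

Doing the removal once up front, instead of checking rverts on every copied edge, keeps the subsequent copy loops simple and avoids repeated set lookups.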
- const ue2::unordered_set rverts(ri.vertices.begin(), - ri.vertices.end()); + removeInteriorEdges(g, ri); for (size_t i = 0; i < replacementSize; i++) { NFAVertex v_new = verts[i]; for (auto v_old : ri.vertices) { if (i == 0) { - copyInEdges(g, v_old, v_new, rverts); + copyInEdges(g, v_old, v_new); } if (i + 1 >= ri.minWidth) { - copyOutEdges(g, v_old, v_new, rverts); + copyOutEdges(g, v_old, v_new); } } } diff --git a/src/nfagraph/ng_prune.cpp b/src/nfagraph/ng_prune.cpp index 88f1880f..adda7031 100644 --- a/src/nfagraph/ng_prune.cpp +++ b/src/nfagraph/ng_prune.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include "util/container.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/report_manager.h" #include @@ -105,23 +106,18 @@ template static bool pruneForwardUseless(NGHolder &h, const nfag_t &g, typename nfag_t::vertex_descriptor s, - vector &vertexColor) { + decltype(make_small_color_map(NGHolder())) &colors) { // Begin with all vertices set to white, as DFV only marks visited // vertices. - fill(vertexColor.begin(), vertexColor.end(), boost::white_color); + colors.fill(small_color::white); - auto index_map = get(&NFAGraphVertexProps::index, g); - - depth_first_visit(g, s, make_dfs_visitor(boost::null_visitor()), - make_iterator_property_map(vertexColor.begin(), - index_map)); + depth_first_visit(g, s, make_dfs_visitor(boost::null_visitor()), colors); vector dead; // All non-special vertices that are still white can be removed. for (auto v : vertices_range(g)) { - u32 idx = g[v].index; - if (!is_special(v, g) && vertexColor[idx] == boost::white_color) { + if (!is_special(v, g) && get(colors, v) == small_color::white) { DEBUG_PRINTF("vertex %zu is unreachable from %zu\n", g[v].index, g[s].index); dead.push_back(NFAVertex(v)); @@ -143,11 +139,11 @@ bool pruneForwardUseless(NGHolder &h, const nfag_t &g, void pruneUseless(NGHolder &g, bool renumber) { DEBUG_PRINTF("pruning useless vertices\n"); assert(hasCorrectlyNumberedVertices(g)); - vector vertexColor(num_vertices(g)); + auto colors = make_small_color_map(g); - bool work_done = pruneForwardUseless(g, g, g.start, vertexColor); + bool work_done = pruneForwardUseless(g, g, g.start, colors); work_done |= pruneForwardUseless(g, reverse_graph(g), - g.acceptEod, vertexColor); + g.acceptEod, colors); if (!work_done) { return; @@ -227,7 +223,7 @@ void pruneHighlanderAccepts(NGHolder &g, const ReportManager &rm) { static bool isDominatedByReporter(const NGHolder &g, - const ue2::unordered_map &dom, + const unordered_map &dom, NFAVertex v, ReportID report_id) { for (auto it = dom.find(v); it != end(dom); it = dom.find(v)) { NFAVertex u = it->second; diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp index 76bc93da..06b9daee 100644 --- a/src/nfagraph/ng_redundancy.cpp +++ b/src/nfagraph/ng_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,8 +78,8 @@ #include "ng_util.h" #include "ue2common.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph_range.h" -#include 
"util/ue2_containers.h" #include #include @@ -747,7 +747,7 @@ u32 findCyclic(const NGHolder &g, vector &cyclic) { static void findCyclicDom(NGHolder &g, vector &cyclic, set &dead, som_type som) { - ue2::unordered_map dominators = findDominators(g); + auto dominators = findDominators(g); for (auto v : vertices_range(g)) { if (is_special(v, g)) { @@ -791,8 +791,7 @@ void findCyclicDom(NGHolder &g, vector &cyclic, static void findCyclicPostDom(NGHolder &g, vector &cyclic, set &dead) { - ue2::unordered_map postdominators = - findPostDominators(g); + auto postdominators = findPostDominators(g); for (auto v : vertices_range(g)) { if (is_special(v, g)) { diff --git a/src/nfagraph/ng_region.cpp b/src/nfagraph/ng_region.cpp index 91904b46..2675be64 100644 --- a/src/nfagraph/ng_region.cpp +++ b/src/nfagraph/ng_region.cpp @@ -56,8 +56,9 @@ #include "ng_util.h" #include "ue2common.h" #include "util/container.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include #include @@ -407,19 +408,20 @@ void liftSinks(const AcyclicGraph &acyclic_g, vector &topoOrder) { } } +using ColorMap = decltype(make_small_color_map(NGHolder())); + /** Build a reverse topo ordering (with only the specials that are in use). We * also want to ensure vertices which only lead to back edges are placed near * their parents. */ static vector buildTopoOrder(const NGHolder &w, const AcyclicGraph &acyclic_g, - vector &colours) { + ColorMap &colours) { vector topoOrder; topoOrder.reserve(num_vertices(w)); topological_sort(acyclic_g, back_inserter(topoOrder), - color_map(make_iterator_property_map(colours.begin(), - get(vertex_index, acyclic_g)))); + color_map(colours)); reorderSpecials(w, acyclic_g, topoOrder); @@ -443,15 +445,14 @@ unordered_map assignRegions(const NGHolder &g) { const u32 numVertices = num_vertices(g); DEBUG_PRINTF("assigning regions for %u vertices in holder\n", numVertices); - vector colours(numVertices); + auto colours = make_small_color_map(g); // Build an acyclic graph for this NGHolder. BackEdgeSet deadEdges; depth_first_search(g, visitor(BackEdges(deadEdges)) .root_vertex(g.start) - .color_map(make_iterator_property_map(colours.begin(), - get(vertex_index, g)))); + .color_map(colours)); auto af = make_bad_edge_filter(&deadEdges); AcyclicGraph acyclic_g(g, af); diff --git a/src/nfagraph/ng_region.h b/src/nfagraph/ng_region.h index a56933dc..a4708a58 100644 --- a/src/nfagraph/ng_region.h +++ b/src/nfagraph/ng_region.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,19 +36,19 @@ #include "ng_holder.h" #include "util/container.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include #include namespace ue2 { /** \brief Assign a region ID to every vertex in the graph. */ -ue2::unordered_map assignRegions(const NGHolder &g); +std::unordered_map assignRegions(const NGHolder &g); /** \brief True if vertices \p a and \p b are in the same region. 
*/ template bool inSameRegion(const Graph &g, NFAVertex a, NFAVertex b, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { assert(contains(region_map, a) && contains(region_map, b)); return region_map.at(a) == region_map.at(b) && @@ -58,7 +58,7 @@ bool inSameRegion(const Graph &g, NFAVertex a, NFAVertex b, /** \brief True if vertex \p b is in a later region than vertex \p a. */ template bool inLaterRegion(const Graph &g, NFAVertex a, NFAVertex b, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { assert(contains(region_map, a) && contains(region_map, b)); u32 aa = g[a].index; @@ -85,7 +85,7 @@ bool inLaterRegion(const Graph &g, NFAVertex a, NFAVertex b, /** \brief True if vertex \p b is in an earlier region than vertex \p a. */ template bool inEarlierRegion(const Graph &g, NFAVertex a, NFAVertex b, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { assert(contains(region_map, a) && contains(region_map, b)); u32 aa = g[a].index; @@ -112,7 +112,7 @@ bool inEarlierRegion(const Graph &g, NFAVertex a, NFAVertex b, /** \brief True if vertex \p v is an entry vertex for its region. */ template bool isRegionEntry(const Graph &g, NFAVertex v, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { // Note that some graph types do not have inv_adjacent_vertices, so we must // use in_edges here. for (const auto &e : in_edges_range(v, g)) { @@ -127,7 +127,7 @@ bool isRegionEntry(const Graph &g, NFAVertex v, /** \brief True if vertex \p v is an exit vertex for its region. */ template bool isRegionExit(const Graph &g, NFAVertex v, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { for (auto w : adjacent_vertices_range(v, g)) { if (!inSameRegion(g, v, w, region_map)) { return true; @@ -140,7 +140,7 @@ bool isRegionExit(const Graph &g, NFAVertex v, /** \brief True if vertex \p v is in a region all on its own. 
*/ template bool isSingletonRegion(const Graph &g, NFAVertex v, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { for (const auto &e : in_edges_range(v, g)) { auto u = source(e, g); if (u != v && inSameRegion(g, v, u, region_map)) { @@ -178,7 +178,7 @@ bool isSingletonRegion(const Graph &g, NFAVertex v, */ template bool isOptionalRegion(const Graph &g, NFAVertex v, - const ue2::unordered_map ®ion_map) { + const std::unordered_map ®ion_map) { assert(isRegionEntry(g, v, region_map)); DEBUG_PRINTF("check if r%u is optional (inspecting v%zu)\n", diff --git a/src/nfagraph/ng_region_redundancy.cpp b/src/nfagraph/ng_region_redundancy.cpp index 264e4312..1126d4d6 100644 --- a/src/nfagraph/ng_region_redundancy.cpp +++ b/src/nfagraph/ng_region_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -60,7 +60,7 @@ struct RegionInfo { static bool regionHasUnexpectedAccept(const NGHolder &g, const u32 region, const flat_set &expected_reports, - const ue2::unordered_map ®ion_map) { + const unordered_map ®ion_map) { /* TODO: only check vertices connected to accept/acceptEOD */ for (auto v : vertices_range(g)) { if (region != region_map.at(v)) { @@ -84,7 +84,7 @@ bool regionHasUnexpectedAccept(const NGHolder &g, const u32 region, static void processCyclicStateForward(NGHolder &h, NFAVertex cyc, const map &info, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, set &deadRegions) { u32 region = region_map.at(cyc); CharReach cr = h[cyc].char_reach; @@ -130,7 +130,7 @@ void processCyclicStateForward(NGHolder &h, NFAVertex cyc, static void processCyclicStateReverse(NGHolder &h, NFAVertex cyc, const map &info, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, set &deadRegions) { u32 region = region_map.at(cyc); CharReach cr = h[cyc].char_reach; @@ -179,7 +179,7 @@ void processCyclicStateReverse(NGHolder &h, NFAVertex cyc, static map buildRegionInfoMap(const NGHolder &g, - const ue2::unordered_map ®ion_map) { + const unordered_map ®ion_map) { map info; for (auto v : vertices_range(g)) { diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index 60ad2200..96c553de 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -47,11 +47,15 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/report_manager.h" +#include "util/unordered.h" #include #include #include +#include +#include #include #include @@ -63,6 +67,7 @@ using namespace std; using boost::depth_first_search; using boost::depth_first_visit; +using boost::make_assoc_property_map; namespace ue2 { @@ -117,7 +122,7 @@ struct ReachSubgraph { static void findInitDepths(const NGHolder &g, - ue2::unordered_map &depths) { + unordered_map &depths) { auto d = calcDepths(g); for (auto v : vertices_range(g)) { @@ -132,12 +137,12 @@ vector buildTopoOrder(const RepeatGraph &g) { /* Note: RepeatGraph is a filtered version of NGHolder and still has * NFAVertex as its vertex descriptor */ - typedef ue2::unordered_set EdgeSet; + typedef unordered_set EdgeSet; EdgeSet deadEdges; // We don't have indices spanning [0,N] on our filtered graph, so we // provide a colour map. 
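Throughout this patch, ue2::unordered_map/unordered_set give way to the std:: containers keyed directly by NFAVertex (as in the region_map parameters above). That only works because the vertex handle is equality-comparable and hashable via std::hash. Here is a minimal sketch of what such a key type needs, using a hypothetical MyVertex wrapper; the real NFAVertex already provides the equivalent support elsewhere in the codebase.

    #include <functional>
    #include <unordered_map>

    // Hypothetical vertex handle wrapping an underlying pointer.
    struct MyVertex {
        void *p = nullptr;
        bool operator==(const MyVertex &o) const { return p == o.p; }
    };

    // std::hash specialisation so MyVertex can key std::unordered_map/set.
    namespace std {
    template <>
    struct hash<MyVertex> {
        size_t operator()(const MyVertex &v) const noexcept {
            return hash<void *>()(v.p);
        }
    };
    } // namespace std

    // A region map can then be an ordinary std container.
    using RegionMap = std::unordered_map<MyVertex, unsigned>;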
- ue2::unordered_map colours; + unordered_map colours; depth_first_search(g, visitor(BackEdges(deadEdges)). color_map(make_assoc_property_map(colours))); @@ -154,22 +159,22 @@ vector buildTopoOrder(const RepeatGraph &g) { static void proper_pred(const NGHolder &g, NFAVertex v, - ue2::unordered_set &p) { + unordered_set &p) { pred(g, v, &p); p.erase(v); // self-loops } static void proper_succ(const NGHolder &g, NFAVertex v, - ue2::unordered_set &s) { + unordered_set &s) { succ(g, v, &s); s.erase(v); // self-loops } static bool roguePredecessor(const NGHolder &g, NFAVertex v, - const ue2::unordered_set &involved, - const ue2::unordered_set &pred) { + const unordered_set &involved, + const unordered_set &pred) { u32 seen = 0; for (auto u : inv_adjacent_vertices_range(v, g)) { @@ -194,8 +199,8 @@ bool roguePredecessor(const NGHolder &g, NFAVertex v, static bool rogueSuccessor(const NGHolder &g, NFAVertex v, - const ue2::unordered_set &involved, - const ue2::unordered_set &succ) { + const unordered_set &involved, + const unordered_set &succ) { u32 seen = 0; for (auto w : adjacent_vertices_range(v, g)) { if (contains(involved, w)) { @@ -244,10 +249,10 @@ bool hasDifferentTops(const NGHolder &g, const vector &verts) { static bool vertexIsBad(const NGHolder &g, NFAVertex v, - const ue2::unordered_set &involved, - const ue2::unordered_set &tail, - const ue2::unordered_set &pred, - const ue2::unordered_set &succ, + const unordered_set &involved, + const unordered_set &tail, + const unordered_set &pred, + const unordered_set &succ, const flat_set &reports) { DEBUG_PRINTF("check vertex %zu\n", g[v].index); @@ -292,13 +297,13 @@ void splitSubgraph(const NGHolder &g, const deque &verts, // We construct a copy of the graph using just the vertices we want, rather // than using a filtered_graph -- this way is faster. NGHolder verts_g; - ue2::unordered_map verts_map; // in g -> in verts_g + unordered_map verts_map; // in g -> in verts_g fillHolder(&verts_g, g, verts, &verts_map); - ue2::unordered_map old2new; + unordered_map old2new; auto ug = createUnGraph(verts_g, true, true, old2new); - ue2::unordered_map repeatMap; + unordered_map repeatMap; size_t num = connected_components(ug, make_assoc_property_map(repeatMap)); DEBUG_PRINTF("found %zu connected repeat components\n", num); @@ -376,10 +381,10 @@ void checkReachSubgraphs(const NGHolder &g, vector &rs, continue; } - ue2::unordered_set involved(rsi.vertices.begin(), - rsi.vertices.end()); - ue2::unordered_set tail(involved); // to look for back-edges. - ue2::unordered_set pred, succ; + unordered_set involved(rsi.vertices.begin(), + rsi.vertices.end()); + unordered_set tail(involved); // to look for back-edges. + unordered_set pred, succ; proper_pred(g, rsi.vertices.front(), pred); proper_succ(g, rsi.vertices.back(), succ); @@ -513,7 +518,7 @@ bool processSubgraph(const NGHolder &g, ReachSubgraph &rsi, NFAVertex first = rsi.vertices.front(); NFAVertex last = rsi.vertices.back(); - typedef ue2::unordered_map DistanceMap; + typedef unordered_map DistanceMap; DistanceMap dist; // Initial distance sets. 
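The buildTopoOrder hunk above keeps an associative colour map because the filtered repeat graph no longer has vertex indices spanning [0, N), so BGL cannot build its default vector-backed colour map. The following sketch shows that workaround in isolation on a listS-vertex adjacency_list (which genuinely lacks a vertex_index property); the visitor and names are illustrative, not the Hyperscan types.

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/depth_first_search.hpp>
    #include <boost/property_map/property_map.hpp>
    #include <map>
    #include <vector>

    using Graph = boost::adjacency_list<boost::vecS, boost::listS,
                                        boost::directedS>;
    using Vertex = boost::graph_traits<Graph>::vertex_descriptor;
    using Edge = boost::graph_traits<Graph>::edge_descriptor;

    // Records back edges found during the DFS (the role of BackEdges above).
    struct BackEdgeRecorder : boost::default_dfs_visitor {
        explicit BackEdgeRecorder(std::vector<Edge> &out_in) : out(out_in) {}
        void back_edge(Edge e, const Graph &) const { out.push_back(e); }
        std::vector<Edge> &out;
    };

    std::vector<Edge> findBackEdges(const Graph &g) {
        std::vector<Edge> back;

        // Associative colour map keyed by vertex descriptor; no compact
        // vertex index is required.
        std::map<Vertex, boost::default_color_type> colours;

        boost::depth_first_search(g,
            boost::visitor(BackEdgeRecorder(back))
                .color_map(boost::make_assoc_property_map(colours)));

        return back;
    }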
@@ -607,7 +612,7 @@ bool processSubgraph(const NGHolder &g, ReachSubgraph &rsi, static bool allPredsInSubgraph(NFAVertex v, const NGHolder &g, - const ue2::unordered_set &involved) { + const unordered_set &involved) { for (auto u : inv_adjacent_vertices_range(v, g)) { if (!contains(involved, u)) { return false; @@ -618,8 +623,8 @@ bool allPredsInSubgraph(NFAVertex v, const NGHolder &g, static void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v, - const ue2::unordered_set &involved, - ue2::unordered_map &depths, + const unordered_set &involved, + unordered_map &depths, vector &tugs) { if (allPredsInSubgraph(v, g, involved)) { // We can transform this vertex into a tug trigger in-place. @@ -698,7 +703,7 @@ u32 unpeelAmount(const NGHolder &g, const ReachSubgraph &rsi) { static void unpeelNearEnd(NGHolder &g, ReachSubgraph &rsi, - ue2::unordered_map &depths, + unordered_map &depths, vector *succs) { u32 unpeel = unpeelAmount(g, rsi); DEBUG_PRINTF("unpeeling %u vertices\n", unpeel); @@ -757,17 +762,24 @@ void getSuccessors(const NGHolder &g, const ReachSubgraph &rsi, * NFA graph and replace it with a cyclic state. */ static void replaceSubgraphWithSpecial(NGHolder &g, ReachSubgraph &rsi, - vector *repeats, - ue2::unordered_map &depths, - ue2::unordered_set &created) { + vector *repeats, + unordered_map &depths, + unordered_set &created) { assert(!rsi.bad); + /* As we may need to unpeel 2 vertices, we need the width to be more than 2. + * This should only happen if the graph did not have redundancy pass + * performed on as vertex count checks would be prevent us reaching here. + */ + if (rsi.repeatMax <= depth(2)) { + return; + } assert(rsi.repeatMin > depth(0)); assert(rsi.repeatMax >= rsi.repeatMin); - assert(rsi.repeatMax > depth(2)); /* may need to unpeel 2 vertices */ + assert(rsi.repeatMax > depth(2)); DEBUG_PRINTF("entry\n"); - const ue2::unordered_set involved(rsi.vertices.begin(), + const unordered_set involved(rsi.vertices.begin(), rsi.vertices.end()); vector succs; getSuccessors(g, rsi, &succs); @@ -828,16 +840,16 @@ void replaceSubgraphWithSpecial(NGHolder &g, ReachSubgraph &rsi, static void replaceSubgraphWithLazySpecial(NGHolder &g, ReachSubgraph &rsi, vector *repeats, - ue2::unordered_map &depths, - ue2::unordered_set &created) { + unordered_map &depths, + unordered_set &created) { assert(!rsi.bad); assert(rsi.repeatMin); assert(rsi.repeatMax >= rsi.repeatMin); DEBUG_PRINTF("entry\n"); - const ue2::unordered_set involved(rsi.vertices.begin(), - rsi.vertices.end()); + const unordered_set involved(rsi.vertices.begin(), + rsi.vertices.end()); vector succs; getSuccessors(g, rsi, &succs); @@ -931,7 +943,7 @@ void reprocessSubgraph(const NGHolder &h, const Grey &grey, * involved in other repeats as a result of earlier repeat transformations. */ static bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi, - const ue2::unordered_set &created) { + const unordered_set &created) { assert(!rsi.bad); if (created.empty()) { @@ -993,8 +1005,8 @@ bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi, * idea to extend to cyclic states, too. 
*/ static void peelStartDotStar(const NGHolder &g, - const ue2::unordered_map &depths, - const Grey &grey, ReachSubgraph &rsi) { + const unordered_map &depths, + const Grey &grey, ReachSubgraph &rsi) { if (rsi.vertices.size() < 1) { return; } @@ -1072,8 +1084,8 @@ bool hasSkipEdges(const NGHolder &g, const ReachSubgraph &rsi) { /* depth info is valid as calculated at entry */ static bool entered_at_fixed_offset(NFAVertex v, const NGHolder &g, - const ue2::unordered_map &depths, - const ue2::unordered_set &reached_by_fixed_tops) { + const unordered_map &depths, + const unordered_set &reached_by_fixed_tops) { DEBUG_PRINTF("|reached_by_fixed_tops| %zu\n", reached_by_fixed_tops.size()); if (is_triggered(g) && !contains(reached_by_fixed_tops, v)) { @@ -1199,12 +1211,12 @@ CharReach predReach(const NGHolder &g, NFAVertex v) { */ static void filterMap(const NGHolder &subg, - ue2::unordered_map &vmap) { + unordered_map &vmap) { NGHolder::vertex_iterator vi, ve; tie(vi, ve) = vertices(subg); - const ue2::unordered_set remaining_verts(vi, ve); + const unordered_set remaining_verts(vi, ve); - ue2::unordered_map fmap; // filtered map + unordered_map fmap; // filtered map for (const auto &m : vmap) { if (contains(remaining_verts, m.second)) { @@ -1219,7 +1231,7 @@ void filterMap(const NGHolder &subg, * the bounded repeat. */ static void buildRepeatGraph(NGHolder &rg, - ue2::unordered_map &rg_map, + unordered_map &rg_map, const NGHolder &g, const ReachSubgraph &rsi, const map>> &triggers) { cloneHolder(rg, g, &rg_map); @@ -1230,7 +1242,7 @@ void buildRepeatGraph(NGHolder &rg, add_edge(rg.accept, rg.acceptEod, rg); // Find the set of vertices in rg involved in the repeat. - ue2::unordered_set rg_involved; + unordered_set rg_involved; for (const auto &v : rsi.vertices) { assert(contains(rg_map, v)); rg_involved.insert(rg_map.at(v)); @@ -1272,7 +1284,7 @@ void buildRepeatGraph(NGHolder &rg, */ static void buildInputGraph(NGHolder &lhs, - ue2::unordered_map &lhs_map, + unordered_map &lhs_map, const NGHolder &g, const NFAVertex first, const map>> &triggers) { DEBUG_PRINTF("building lhs with first=%zu\n", g[first].index); @@ -1326,8 +1338,8 @@ static const size_t MAX_SOLE_ENTRY_VERTICES = 10000; * single offset at runtime. See UE-1361. */ static bool hasSoleEntry(const NGHolder &g, const ReachSubgraph &rsi, - const ue2::unordered_map &depths, - const ue2::unordered_set &reached_by_fixed_tops, + const unordered_map &depths, + const unordered_set &reached_by_fixed_tops, const map>> &triggers) { DEBUG_PRINTF("checking repeat {%s,%s}\n", rsi.repeatMin.str().c_str(), rsi.repeatMax.str().c_str()); @@ -1357,12 +1369,12 @@ bool hasSoleEntry(const NGHolder &g, const ReachSubgraph &rsi, } NGHolder rg; - ue2::unordered_map rg_map; + unordered_map rg_map; buildRepeatGraph(rg, rg_map, g, rsi, triggers); assert(rg.kind == g.kind); NGHolder lhs; - ue2::unordered_map lhs_map; + unordered_map lhs_map; buildInputGraph(lhs, lhs_map, g, first, triggers); assert(lhs.kind == g.kind); @@ -1376,7 +1388,7 @@ bool hasSoleEntry(const NGHolder &g, const ReachSubgraph &rsi, // are in one region, vertices in the bounded repeat are in another. 
const u32 lhs_region = 1; const u32 repeat_region = 2; - ue2::unordered_map region_map; + unordered_map region_map; for (const auto &v : rsi.vertices) { assert(!is_special(v, g)); // no specials in repeats @@ -1472,7 +1484,7 @@ struct StrawWalker { NFAVertex walk(NFAVertex v, vector &straw) const { DEBUG_PRINTF("walk from %zu\n", g[v].index); - ue2::unordered_set visited; + unordered_set visited; straw.clear(); while (!is_special(v, g)) { @@ -1593,7 +1605,7 @@ vector getUnionedTrigger(const NGHolder &g, const NFAVertex v) { vector trigger; - ue2::flat_set curr, next; + flat_set curr, next; insert(&curr, inv_adjacent_vertices(v, g)); if (contains(curr, g.start)) { @@ -1694,7 +1706,7 @@ vector> getRepeatTriggers(const NGHolder &g, assert(!done.empty()); // Convert our path list into a set of unique triggers. - ue2::unordered_set> unique_triggers; + ue2_unordered_set> unique_triggers; for (const auto &path : done) { vector reach_path; for (auto jt = path.rbegin(), jte = path.rend(); jt != jte; ++jt) { @@ -1742,8 +1754,8 @@ static void selectHistoryScheme(const NGHolder &g, const ReportManager *rm, ReachSubgraph &rsi, - const ue2::unordered_map &depths, - const ue2::unordered_set &reached_by_fixed_tops, + const unordered_map &depths, + const unordered_set &reached_by_fixed_tops, const map>> &triggers, const vector &all_repeats, const bool simple_model_selection) { @@ -1811,7 +1823,7 @@ selectHistoryScheme(const NGHolder &g, const ReportManager *rm, static void buildFeeder(NGHolder &g, const BoundedRepeatData &rd, - ue2::unordered_set &created, + unordered_set &created, const vector &straw) { if (!g[rd.cyclic].char_reach.all()) { // Create another cyclic feeder state with flipped reach. It has an @@ -1858,7 +1870,7 @@ void buildFeeder(NGHolder &g, const BoundedRepeatData &rd, */ static bool improveLeadingRepeat(NGHolder &g, BoundedRepeatData &rd, - ue2::unordered_set &created, + unordered_set &created, const vector &all_repeats) { assert(edge(g.startDs, g.startDs, g).second); @@ -1962,7 +1974,7 @@ vector makeOwnStraw(NGHolder &g, BoundedRepeatData &rd, */ static bool improveLeadingRepeatOutfix(NGHolder &g, BoundedRepeatData &rd, - ue2::unordered_set &created, + unordered_set &created, const vector &all_repeats) { assert(g.kind == NFA_OUTFIX); @@ -2060,7 +2072,7 @@ bool endsInAcceptEod(const NGHolder &g, const ReachSubgraph &rsi) { namespace { class pfti_visitor : public boost::default_dfs_visitor { public: - pfti_visitor(ue2::unordered_map &top_depths_in, + pfti_visitor(unordered_map &top_depths_in, const depth &our_depth_in) : top_depths(top_depths_in), our_depth(our_depth_in) {} @@ -2076,7 +2088,7 @@ public: top_depths[v] = our_depth; } } - ue2::unordered_map &top_depths; + unordered_map &top_depths; const depth &our_depth; }; } // namespace @@ -2084,14 +2096,14 @@ public: static void populateFixedTopInfo(const map &fixed_depth_tops, const NGHolder &g, - ue2::unordered_set *reached_by_fixed_tops) { + unordered_set *reached_by_fixed_tops) { if (fixed_depth_tops.empty()) { return; /* we will never find anything */ } assert(!proper_out_degree(g.startDs, g)); - ue2::unordered_map top_depths; - vector colours(num_vertices(g)); + unordered_map top_depths; + auto colours = make_small_color_map(g); for (const auto &e : out_edges_range(g.start, g)) { NFAVertex v = target(e, g); @@ -2121,9 +2133,7 @@ void populateFixedTopInfo(const map &fixed_depth_tops, /* for each vertex reachable from v update its map to reflect that it is * reachable from a top of depth td. 
*/ - depth_first_visit(g, v, pfti_visitor(top_depths, td), - make_iterator_property_map(colours.begin(), - get(vertex_index, g))); + depth_first_visit(g, v, pfti_visitor(top_depths, td), colours); } for (const auto &v_depth : top_depths) { @@ -2143,7 +2153,7 @@ void populateFixedTopInfo(const map &fixed_depth_tops, static bool hasOverlappingRepeats(UNUSED const NGHolder &g, const vector &repeats) { - ue2::unordered_set involved; + unordered_set involved; for (const auto &br : repeats) { if (contains(involved, br.cyclic)) { @@ -2178,7 +2188,7 @@ bool hasOverlappingRepeats(UNUSED const NGHolder &g, */ static bool repeatIsNasty(const NGHolder &g, const ReachSubgraph &rsi, - const ue2::unordered_map &depths) { + const unordered_map &depths) { if (num_vertices(g) > NFA_MAX_STATES) { // We may have no choice but to implement this repeat to get the graph // down to a tractable number of vertices. @@ -2231,13 +2241,13 @@ void analyseRepeats(NGHolder &g, const ReportManager *rm, #ifndef NDEBUG // So we can assert that the number of tops hasn't changed at the end of // this analysis. - const ue2::flat_set allTops = getTops(g); + const flat_set allTops = getTops(g); #endif // Later on, we're (a little bit) dependent on depth information for // unpeeling and so forth. Note that these depths MUST be maintained when // new vertices are added. - ue2::unordered_map depths; + unordered_map depths; findInitDepths(g, depths); // Construct our list of subgraphs with the same reach using BGL magic. @@ -2294,13 +2304,13 @@ void analyseRepeats(NGHolder &g, const ReportManager *rm, // could make this unnecessary? const unique_ptr orig_g(cloneHolder(g)); - ue2::unordered_set reached_by_fixed_tops; + unordered_set reached_by_fixed_tops; if (is_triggered(g)) { populateFixedTopInfo(fixed_depth_tops, g, &reached_by_fixed_tops); } // Go to town on the remaining acceptable subgraphs. - ue2::unordered_set created; + unordered_set created; for (auto &rsi : rs) { DEBUG_PRINTF("subgraph (beginning vertex %zu) is a {%s,%s} repeat\n", g[rsi.vertices.front()].index, diff --git a/src/nfagraph/ng_repeat.h b/src/nfagraph/ng_repeat.h index 2f14cb0c..cfd804b7 100644 --- a/src/nfagraph/ng_repeat.h +++ b/src/nfagraph/ng_repeat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,7 +37,7 @@ #include "ue2common.h" #include "nfa/repeat_internal.h" #include "util/depth.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include #include @@ -122,7 +122,7 @@ void findRepeats(const NGHolder &h, u32 minRepeatVertices, struct PureRepeat { CharReach reach; DepthMinMax bounds; - ue2::flat_set reports; + flat_set reports; bool operator==(const PureRepeat &a) const { return reach == a.reach && bounds == a.bounds && reports == a.reports; diff --git a/src/nfagraph/ng_restructuring.cpp b/src/nfagraph/ng_restructuring.cpp index 32cdac23..704697e5 100644 --- a/src/nfagraph/ng_restructuring.cpp +++ b/src/nfagraph/ng_restructuring.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -131,9 +131,9 @@ void getStateOrdering(NGHolder &g, const flat_set &tops, // Returns the number of states. 
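populateFixedTopInfo above walks only the part of the graph reachable from each top's successor, which is why it uses depth_first_visit (single root, caller-supplied colour map) rather than depth_first_search. Below is a standalone sketch of that visit-from-one-root pattern, with hypothetical names; the labels stand in for the top depths recorded by pfti_visitor.

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/depth_first_search.hpp>
    #include <boost/property_map/property_map.hpp>
    #include <unordered_map>
    #include <vector>

    using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                        boost::directedS>;
    using Vertex = boost::graph_traits<Graph>::vertex_descriptor;

    // Tags every vertex discovered from the search root with a label.
    struct label_visitor : boost::default_dfs_visitor {
        label_visitor(std::unordered_map<Vertex, unsigned> &labels_in,
                      unsigned label_in)
            : labels(labels_in), label(label_in) {}

        void discover_vertex(Vertex v, const Graph &) const {
            labels[v] = label;
        }

        std::unordered_map<Vertex, unsigned> &labels;
        unsigned label;
    };

    void labelReachable(const Graph &g, Vertex root, unsigned label,
                        std::unordered_map<Vertex, unsigned> &labels) {
        // depth_first_visit explores only from the given root; the colour map
        // must be supplied by the caller and start out all-white.
        std::vector<boost::default_color_type> colours(num_vertices(g),
                                                       boost::white_color);
        auto cmap = boost::make_iterator_property_map(
            colours.begin(), get(boost::vertex_index, g));

        boost::depth_first_visit(g, root, label_visitor(labels, label), cmap);
    }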
static -ue2::unordered_map +unordered_map getStateIndices(const NGHolder &h, const vector &ordering) { - ue2::unordered_map states; + unordered_map states; for (const auto &v : vertices_range(h)) { states[v] = NO_STATE; } diff --git a/src/nfagraph/ng_restructuring.h b/src/nfagraph/ng_restructuring.h index bbd478d5..75d19c62 100644 --- a/src/nfagraph/ng_restructuring.h +++ b/src/nfagraph/ng_restructuring.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,14 +28,16 @@ /** \file * \brief State numbering and late graph restructuring code. - */ + #ifndef NG_RESTRUCTURING_H #define NG_RESTRUCTURING_H #include "ng_holder.h" #include "ue2common.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" + +#include namespace ue2 { @@ -48,14 +50,14 @@ static constexpr u32 NO_STATE = ~0; /** * \brief Gives each participating vertex in the graph a unique state index. */ -unordered_map +std::unordered_map numberStates(NGHolder &h, const flat_set &tops); /** * \brief Counts the number of states (vertices with state indices) in the * graph. */ -u32 countStates(const unordered_map &state_ids); +u32 countStates(const std::unordered_map &state_ids); } // namespace ue2 diff --git a/src/nfagraph/ng_revacc.cpp b/src/nfagraph/ng_revacc.cpp index dc86dd44..0f932668 100644 --- a/src/nfagraph/ng_revacc.cpp +++ b/src/nfagraph/ng_revacc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "util/charreach.h" #include "util/graph_range.h" +#include + using namespace std; namespace ue2 { diff --git a/src/nfagraph/ng_small_literal_set.cpp b/src/nfagraph/ng_small_literal_set.cpp index 1d7be65b..9c2d9ba3 100644 --- a/src/nfagraph/ng_small_literal_set.cpp +++ b/src/nfagraph/ng_small_literal_set.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,8 +33,8 @@ #include "ng_small_literal_set.h" #include "grey.h" -#include "ng_util.h" #include "ng_holder.h" +#include "ng_util.h" #include "rose/rose_build.h" #include "util/compare.h" #include "util/compile_context.h" @@ -100,7 +100,7 @@ bool operator<(const sls_literal &a, const sls_literal &b) { static bool checkLongMixedSensitivityLiterals( - const map> &literals) { + const map> &literals) { const size_t len = MAX_MASK2_WIDTH; for (const sls_literal &lit : literals | map_keys) { @@ -114,7 +114,7 @@ bool checkLongMixedSensitivityLiterals( static bool findLiterals(const NGHolder &g, - map> *literals) { + map> *literals) { vector order = getTopoOrdering(g); vector> built(num_vertices(g)); @@ -198,7 +198,7 @@ bool findLiterals(const NGHolder &g, } static -size_t min_period(const map> &literals) { +size_t min_period(const map> &literals) { size_t rv = SIZE_MAX; for (const sls_literal &lit : literals | map_keys) { @@ -222,9 +222,14 @@ bool handleSmallLiteralSets(RoseBuild &rose, const NGHolder &g, return false; } + if (!hasNarrowReachVertex(g, MAX_LITERAL_SET_SIZE * 2 + 1)) { + DEBUG_PRINTF("vertex with wide 
reach found\n"); + return false; + } + DEBUG_PRINTF("looking for literals\n"); - map> literals; + map> literals; if (!findLiterals(g, &literals)) { DEBUG_PRINTF(":(\n"); return false; diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index 67438103..d23ac408 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -69,6 +69,8 @@ #include #include +#include +#include #include using namespace std; @@ -103,7 +105,7 @@ struct som_plan { static bool regionCanEstablishSom(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const u32 region, const vector &r_exits, const vector &depths) { if (region == regions.at(g.accept) || @@ -149,7 +151,7 @@ struct region_info { static void buildRegionMapping(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, map &info, bool include_region_0 = false) { for (auto v : vertices_range(g)) { @@ -228,7 +230,7 @@ void buildRegionMapping(const NGHolder &g, static bool validateXSL(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const u32 region, const CharReach &escapes, u32 *bad_region) { /* need to check that the escapes escape all of the graph past region */ u32 first_bad_region = ~0U; @@ -251,7 +253,7 @@ bool validateXSL(const NGHolder &g, static bool validateEXSL(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const u32 region, const CharReach &escapes, const NGHolder &prefix, u32 *bad_region) { /* EXSL: To be a valid EXSL with escapes e, we require that all states @@ -265,7 +267,7 @@ bool validateEXSL(const NGHolder &g, const vector escapes_vec(1, escapes); const vector notescapes_vec(1, ~escapes); - ue2::flat_set states; + flat_set states; /* turn on all states past the prefix */ DEBUG_PRINTF("region %u is cutover\n", region); for (auto v : vertices_range(g)) { @@ -278,7 +280,7 @@ bool validateEXSL(const NGHolder &g, states = execute_graph(g, escapes_vec, states); /* flood with any number of not escapes */ - ue2::flat_set prev_states; + flat_set prev_states; while (prev_states != states) { prev_states = states; states = execute_graph(g, notescapes_vec, states); @@ -288,7 +290,7 @@ bool validateEXSL(const NGHolder &g, /* find input starts to use for when we are running the prefix through as * when the escape character arrives we may be in matching the prefix * already */ - ue2::flat_set prefix_start_states; + flat_set prefix_start_states; for (auto v : vertices_range(prefix)) { if (v != prefix.accept && v != prefix.acceptEod /* and as we have already made it past the prefix once */ @@ -353,7 +355,7 @@ bool isPossibleLock(const NGHolder &g, static unique_ptr -makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, +makePrefix(const NGHolder &g, const unordered_map ®ions, const region_info &curr, const region_info &next, bool renumber = true) { const vector &curr_exits = curr.exits; @@ -368,12 +370,12 @@ makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, deque lhs_verts; insert(&lhs_verts, lhs_verts.end(), vertices(g)); - ue2::unordered_map lhs_map; // g -> prefix + unordered_map lhs_map; // g -> prefix fillHolder(&prefix, g, lhs_verts, &lhs_map); prefix.kind = NFA_OUTFIX; // We need a reverse mapping to track regions. 
- ue2::unordered_map rev_map; // prefix -> g + unordered_map rev_map; // prefix -> g for (const auto &e : lhs_map) { rev_map.emplace(e.second, e.first); } @@ -541,7 +543,7 @@ void setMidfixReports(ReportManager &rm, const som_plan &item, static bool finalRegion(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, NFAVertex v) { u32 region = regions.at(v); for (auto w : adjacent_vertices_range(v, g)) { @@ -771,7 +773,7 @@ void fillHolderForLockCheck(NGHolder *out, const NGHolder &g, static void fillRoughMidfix(NGHolder *out, const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, map::const_iterator picked) { /* as we are not the first prefix, we are probably not acyclic. We need to @@ -941,7 +943,7 @@ bool isMandRegionBetween(map::const_iterator a, // (woot!); updates picked, plan and bad_region. static bool advancePlan(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const NGHolder &prefix, bool stuck, map::const_iterator &picked, const map::const_iterator furthest, @@ -1051,13 +1053,12 @@ void addReporterVertices(const region_info &r, const NGHolder &g, // Fetches the mappings of all preds of {accept, acceptEod} in this region. static void addMappedReporterVertices(const region_info &r, const NGHolder &g, - const ue2::unordered_map &mapping, + const unordered_map &mapping, vector &reporters) { for (auto v : r.exits) { if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) { DEBUG_PRINTF("adding v=%zu\n", g[v].index); - ue2::unordered_map::const_iterator it = - mapping.find(v); + auto it = mapping.find(v); assert(it != mapping.end()); reporters.push_back(it->second); } @@ -1068,9 +1069,9 @@ void addMappedReporterVertices(const region_info &r, const NGHolder &g, // from earlier regions. static void cloneGraphWithOneEntry(NGHolder &out, const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, NFAVertex entry, const vector &enters, - ue2::unordered_map &orig_to_copy) { + unordered_map &orig_to_copy) { orig_to_copy.clear(); cloneHolder(out, g, &orig_to_copy); @@ -1095,7 +1096,7 @@ void cloneGraphWithOneEntry(NGHolder &out, const NGHolder &g, } static -void expandGraph(NGHolder &g, ue2::unordered_map ®ions, +void expandGraph(NGHolder &g, unordered_map ®ions, vector &enters) { assert(!enters.empty()); const u32 split_region = regions.at(enters.front()); @@ -1178,11 +1179,11 @@ void expandGraph(NGHolder &g, ue2::unordered_map ®ions, static bool doTreePlanningIntl(NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, map::const_iterator picked, u32 bad_region, u32 parent_plan, - const ue2::unordered_map ©_to_orig, + const unordered_map ©_to_orig, vector &plan, const Grey &grey) { assert(picked != info.end()); @@ -1341,7 +1342,7 @@ bool doTreePlanning(NGHolder &g, // regions. NGHolder g_path; - ue2::unordered_map orig_to_copy; + unordered_map orig_to_copy; cloneGraphWithOneEntry(g_path, g, g_regions, v, enters, orig_to_copy); auto regions = assignRegions(g_path); dumpHolder(g_path, regions, 14, "som_treepath", grey); @@ -1375,7 +1376,7 @@ bool doTreePlanning(NGHolder &g, } // Construct reverse mapping from vertices in g_path to g. 
- ue2::unordered_map copy_to_orig; + unordered_map copy_to_orig; for (const auto &m : orig_to_copy) { copy_to_orig.insert(make_pair(m.second, m.first)); } @@ -1398,7 +1399,7 @@ enum dsp_behaviour { static bool doSomPlanning(NGHolder &g, bool stuck_in, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, map::const_iterator picked, vector &plan, @@ -1734,8 +1735,6 @@ namespace { struct SomRevNfa { SomRevNfa(NFAVertex s, ReportID r, bytecode_ptr n) : sink(s), report(r), nfa(move(n)) {} - SomRevNfa(SomRevNfa &&s) // MSVC2013 needs this for emplace - : sink(s.sink), report(s.report), nfa(move(s.nfa)) {} NFAVertex sink; ReportID report; bytecode_ptr nfa; @@ -1940,7 +1939,7 @@ map::const_iterator findLaterLiteral(const NGHolder &g, static bool attemptToBuildChainAfterSombe(SomSlotManager &ssm, NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, map::const_iterator picked, const Grey &grey, @@ -2014,7 +2013,7 @@ void setReportOnHaigPrefix(RoseBuild &rose, NGHolder &h) { static bool tryHaig(RoseBuild &rose, NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, som_type som, u32 somPrecision, map::const_iterator picked, shared_ptr *haig, shared_ptr *haig_prefix, @@ -2062,7 +2061,7 @@ void roseAddHaigLiteral(RoseBuild &tb, const shared_ptr &prefix, static sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const ExpressionInfo &expr, u32 comp_id, som_type som, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, map::const_iterator lower_bound) { DEBUG_PRINTF("entry\n"); @@ -2343,7 +2342,7 @@ bool splitOffLeadingLiterals(const NGHolder &g, set *lit_out, } } - ue2::unordered_map rhs_map; + unordered_map rhs_map; vector pivots; insert(&pivots, pivots.end(), adj_term1); splitRHS(g, pivots, rhs, &rhs_map); @@ -2354,7 +2353,7 @@ bool splitOffLeadingLiterals(const NGHolder &g, set *lit_out, static void findBestLiteral(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, ue2_literal *lit_out, NFAVertex *v, const CompileContext &cc) { map info; @@ -2394,7 +2393,7 @@ void findBestLiteral(const NGHolder &g, static bool splitOffBestLiteral(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, ue2_literal *lit_out, NGHolder *lhs, NGHolder *rhs, const CompileContext &cc) { NFAVertex v = NGHolder::null_vertex(); @@ -2406,8 +2405,8 @@ bool splitOffBestLiteral(const NGHolder &g, DEBUG_PRINTF("literal is '%s'\n", dumpString(*lit_out).c_str()); - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(g, v, lhs, &lhs_map, rhs, &rhs_map); @@ -2498,7 +2497,7 @@ bool doLitHaigSom(NG &ng, NGHolder &g, som_type som) { static bool doHaigLitHaigSom(NG &ng, NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, som_type som) { if (!ng.cc.grey.allowLitHaig) { return false; @@ -2732,7 +2731,7 @@ bool trySombe(NG &ng, NGHolder &g, som_type som) { static map::const_iterator pickInitialSomCut(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, const vector &depths) { map::const_iterator picked = info.end(); @@ -2757,7 +2756,7 @@ map::const_iterator pickInitialSomCut(const NGHolder &g, static map::const_iterator tryForLaterRevNfaCut(const NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, const vector &depths, const map::const_iterator &orig, @@ -2849,7 +2848,7 @@ map::const_iterator 
tryForLaterRevNfaCut(const NGHolder &g, static unique_ptr makePrefixForChain(NGHolder &g, - const ue2::unordered_map ®ions, + const unordered_map ®ions, const map &info, const map::const_iterator &picked, vector *depths, bool prefix_by_rev, diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp index a3b6ee5f..1e7a41bb 100644 --- a/src/nfagraph/ng_som_util.cpp +++ b/src/nfagraph/ng_som_util.cpp @@ -54,7 +54,7 @@ vector getDistancesFromSOM(const NGHolder &g_orig) { // We operate on a temporary copy of the original graph here, so we don't // have to mutate the original. NGHolder g; - ue2::unordered_map vmap; // vertex in g_orig to vertex in g + unordered_map vmap; // vertex in g_orig to vertex in g cloneHolder(g, g_orig, &vmap); vector vstarts; @@ -136,7 +136,7 @@ bool firstMatchIsFirst(const NGHolder &p) { return false; } - ue2::flat_set states; + flat_set states; /* turn on all states (except starts - avoid suffix matches) */ /* If we were doing (1) we would also except states leading to accepts - avoid prefix matches */ @@ -166,7 +166,7 @@ bool firstMatchIsFirst(const NGHolder &p) { } bool somMayGoBackwards(NFAVertex u, const NGHolder &g, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, smgb_cache &cache) { /* Need to ensure all matches of the graph g up to u contain no infixes * which are also matches of the graph to u. @@ -215,7 +215,7 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, } } - ue2::unordered_map orig_to_copy; + unordered_map orig_to_copy; NGHolder c_g; cloneHolder(c_g, g, &orig_to_copy); @@ -287,7 +287,7 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, } bool sentClearsTail(const NGHolder &g, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, const NGHolder &sent, u32 last_head_region, u32 *bad_region) { /* if a subsequent match from the prefix clears the rest of the pattern @@ -312,7 +312,7 @@ bool sentClearsTail(const NGHolder &g, */ u32 first_bad_region = ~0U; - ue2::flat_set states; + flat_set states; /* turn on all states */ DEBUG_PRINTF("region %u is cutover\n", last_head_region); for (auto v : vertices_range(g)) { diff --git a/src/nfagraph/ng_som_util.h b/src/nfagraph/ng_som_util.h index 793dd2c3..e2d38642 100644 --- a/src/nfagraph/ng_som_util.h +++ b/src/nfagraph/ng_som_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,9 +35,9 @@ #include "ng_util.h" #include "util/depth.h" -#include "util/ue2_containers.h" #include +#include #include namespace ue2 { @@ -61,7 +61,7 @@ struct smgb_cache : public mbsb_cache { }; bool somMayGoBackwards(NFAVertex u, const NGHolder &g, - const ue2::unordered_map ®ion_map, + const std::unordered_map ®ion_map, smgb_cache &cache); /** @@ -75,7 +75,7 @@ bool somMayGoBackwards(NFAVertex u, const NGHolder &g, * region ID associated with a tail state that is still on. 
*/ bool sentClearsTail(const NGHolder &g, - const ue2::unordered_map ®ion_map, + const std::unordered_map ®ion_map, const NGHolder &sent, u32 last_head_region, u32 *bad_region); diff --git a/src/nfagraph/ng_split.cpp b/src/nfagraph/ng_split.cpp index 3c2baee4..91a099fc 100644 --- a/src/nfagraph/ng_split.cpp +++ b/src/nfagraph/ng_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,7 +37,6 @@ #include "util/container.h" #include "util/graph.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" #include #include @@ -63,12 +62,13 @@ void clearAccepts(NGHolder &g) { } static -void filterSplitMap(const NGHolder &g, ue2::unordered_map *out_map) { - ue2::unordered_set verts; +void filterSplitMap(const NGHolder &g, + unordered_map *out_map) { + unordered_set verts; insert(&verts, vertices(g)); - ue2::unordered_map::iterator it = out_map->begin(); + auto it = out_map->begin(); while (it != out_map->end()) { - ue2::unordered_map::iterator jt = it; + auto jt = it; ++it; if (!contains(verts, jt->second)) { out_map->erase(jt); @@ -78,8 +78,8 @@ void filterSplitMap(const NGHolder &g, ue2::unordered_map static void splitLHS(const NGHolder &base, const vector &pivots, - const vector &rhs_pivots, - NGHolder *lhs, ue2::unordered_map *lhs_map) { + const vector &rhs_pivots, NGHolder *lhs, + unordered_map *lhs_map) { assert(lhs && lhs_map); cloneHolder(*lhs, base, lhs_map); @@ -131,7 +131,7 @@ void splitLHS(const NGHolder &base, const vector &pivots, } void splitLHS(const NGHolder &base, NFAVertex pivot, - NGHolder *lhs, ue2::unordered_map *lhs_map) { + NGHolder *lhs, unordered_map *lhs_map) { vector pivots(1, pivot); vector rhs_pivots; insert(&rhs_pivots, rhs_pivots.end(), adjacent_vertices(pivot, base)); @@ -139,7 +139,7 @@ void splitLHS(const NGHolder &base, NFAVertex pivot, } void splitRHS(const NGHolder &base, const vector &pivots, - NGHolder *rhs, ue2::unordered_map *rhs_map) { + NGHolder *rhs, unordered_map *rhs_map) { assert(rhs && rhs_map); cloneHolder(*rhs, base, rhs_map); @@ -211,8 +211,8 @@ void findCommonSuccessors(const NGHolder &g, const vector &pivots, } void splitGraph(const NGHolder &base, const vector &pivots, - NGHolder *lhs, ue2::unordered_map *lhs_map, - NGHolder *rhs, ue2::unordered_map *rhs_map) { + NGHolder *lhs, unordered_map *lhs_map, + NGHolder *rhs, unordered_map *rhs_map) { DEBUG_PRINTF("splitting graph at %zu vertices\n", pivots.size()); assert(!has_parallel_edge(base)); @@ -235,8 +235,8 @@ void splitGraph(const NGHolder &base, const vector &pivots, } void splitGraph(const NGHolder &base, NFAVertex pivot, - NGHolder *lhs, ue2::unordered_map *lhs_map, - NGHolder *rhs, ue2::unordered_map *rhs_map) { + NGHolder *lhs, unordered_map *lhs_map, + NGHolder *rhs, unordered_map *rhs_map) { vector pivots(1, pivot); splitGraph(base, pivots, lhs, lhs_map, rhs, rhs_map); } diff --git a/src/nfagraph/ng_split.h b/src/nfagraph/ng_split.h index 31c1cf35..9ddc0332 100644 --- a/src/nfagraph/ng_split.h +++ b/src/nfagraph/ng_split.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,10 +33,10 @@ #ifndef NG_SPLIT_H #define NG_SPLIT_H -#include - #include 
"ng_holder.h" -#include "util/ue2_containers.h" + +#include +#include namespace ue2 { @@ -55,21 +55,21 @@ class NGHolder; * vertices which have an edge to every pivot */ void splitGraph(const NGHolder &base, NFAVertex pivot, NGHolder *lhs, - ue2::unordered_map *lhs_map, + std::unordered_map *lhs_map, NGHolder *rhs, - ue2::unordered_map *rhs_map); + std::unordered_map *rhs_map); void splitGraph(const NGHolder &base, const std::vector &pivots, NGHolder *lhs, - ue2::unordered_map *lhs_map, + std::unordered_map *lhs_map, NGHolder *rhs, - ue2::unordered_map *rhs_map); + std::unordered_map *rhs_map); void splitLHS(const NGHolder &base, NFAVertex pivot, NGHolder *lhs, - ue2::unordered_map *lhs_map); + std::unordered_map *lhs_map); void splitRHS(const NGHolder &base, const std::vector &pivots, - NGHolder *rhs, ue2::unordered_map *rhs_map); + NGHolder *rhs, std::unordered_map *rhs_map); } // namespace ue2 diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp index ebec3a4a..03495d14 100644 --- a/src/nfagraph/ng_squash.cpp +++ b/src/nfagraph/ng_squash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -104,7 +104,6 @@ #include "ng_region.h" #include "ng_som_util.h" #include "ng_util.h" -#include "ng_util.h" #include "util/container.h" #include "util/graph_range.h" #include "util/report_manager.h" @@ -112,6 +111,8 @@ #include #include +#include +#include #include #include @@ -120,13 +121,14 @@ using namespace std; namespace ue2 { -typedef ue2::unordered_map > PostDomTree; +using PostDomTree = unordered_map>; static -void buildPDomTree(const NGHolder &g, PostDomTree &tree) { - ue2::unordered_map postdominators = - findPostDominators(g); +PostDomTree buildPDomTree(const NGHolder &g) { + PostDomTree tree; + tree.reserve(num_vertices(g)); + + auto postdominators = findPostDominators(g); for (auto v : vertices_range(g)) { if (is_special(v, g)) { @@ -138,6 +140,7 @@ void buildPDomTree(const NGHolder &g, PostDomTree &tree) { tree[pdom].insert(v); } } + return tree; } /** @@ -150,13 +153,13 @@ void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v, const CharReach &cr, const NFAStateSet &init, const vector &vByIndex, const PostDomTree &tree, som_type som, const vector &som_depths, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, smgb_cache &cache) { DEBUG_PRINTF("build base squash mask for vertex %zu)\n", g[v].index); vector q; - PostDomTree::const_iterator it = tree.find(v); + auto it = tree.find(v); if (it != tree.end()) { q.insert(q.end(), it->second.begin(), it->second.end()); } @@ -272,9 +275,9 @@ void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) { static void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, const PostDomTree &pdom_tree, const NFAStateSet &init, - map *squash, som_type som, - const vector &som_depths, - const ue2::unordered_map ®ion_map, + unordered_map *squash, + som_type som, const vector &som_depths, + const unordered_map ®ion_map, smgb_cache &cache) { deque remaining; for (const auto &m : *squash) { @@ -316,37 +319,41 @@ void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, } } -/* If there are redundant states in the graph, it may be possible for two sibling - * .* states to try to squash each other -- which should be prevented +/* If there are redundant states in the 
graph, it may be possible for two + * sibling .* states to try to squash each other -- which should be prevented. * * Note: this situation should only happen if ng_equivalence has not been run. */ static void clearMutualSquashers(const NGHolder &g, const vector &vByIndex, - map &squash) { + unordered_map &squash) { for (auto it = squash.begin(); it != squash.end();) { NFAVertex a = it->first; u32 a_index = g[a].index; NFAStateSet a_squash = ~it->second; /* default is mask of survivors */ - for (NFAStateSet::size_type b_index = a_squash.find_first(); - b_index != a_squash.npos; b_index = a_squash.find_next(b_index)) { + for (auto b_index = a_squash.find_first(); b_index != a_squash.npos; + b_index = a_squash.find_next(b_index)) { assert(b_index != a_index); NFAVertex b = vByIndex[b_index]; - if (!contains(squash, b)) { + + auto b_it = squash.find(b); + if (b_it == squash.end()) { continue; } - if (!squash[b].test(a_index)) { + auto &b_squash = b_it->second; + if (!b_squash.test(a_index)) { /* b and a squash each other, prevent this */ DEBUG_PRINTF("removing mutual squash %u %zu\n", a_index, b_index); - squash[b].set(a_index); + b_squash.set(a_index); it->second.set(b_index); } } if (it->second.all()) { - DEBUG_PRINTF("%u is no longer an effictive squash state\n", a_index); + DEBUG_PRINTF("%u is no longer an effective squash state\n", + a_index); it = squash.erase(it); } else { ++it; @@ -354,16 +361,16 @@ void clearMutualSquashers(const NGHolder &g, const vector &vByIndex, } } -map findSquashers(const NGHolder &g, som_type som) { - map squash; +unordered_map findSquashers(const NGHolder &g, + som_type som) { + unordered_map squash; // Number of bits to use for all our masks. If we're a triggered graph, // tops have already been assigned, so we don't have to account for them. const u32 numStates = num_vertices(g); // Build post-dominator tree. - PostDomTree pdom_tree; - buildPDomTree(g, pdom_tree); + auto pdom_tree = buildPDomTree(g); // Build list of vertices by state ID and a set of init states. vector vByIndex(numStates, NGHolder::null_vertex()); @@ -508,9 +515,11 @@ map findSquashers(const NGHolder &g, som_type som) { * -# squash only a few acyclic states */ void filterSquashers(const NGHolder &g, - map &squash) { + unordered_map &squash) { + assert(hasCorrectlyNumberedVertices(g)); + DEBUG_PRINTF("filtering\n"); - map rev; /* vertex_index -> vertex */ + vector rev(num_vertices(g)); /* vertex_index -> vertex */ for (auto v : vertices_range(g)) { rev[g[v].index] = v; } @@ -529,8 +538,8 @@ void filterSquashers(const NGHolder &g, NFAStateSet squashed = squash[v]; squashed.flip(); /* default sense for mask of survivors */ - for (NFAStateSet::size_type sq = squashed.find_first(); - sq != squashed.npos; sq = squashed.find_next(sq)) { + for (auto sq = squashed.find_first(); sq != squashed.npos; + sq = squashed.find_next(sq)) { NFAVertex u = rev[sq]; if (hasSelfLoop(u, g)) { DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq); @@ -619,7 +628,7 @@ static vector findUnreachable(const NGHolder &g) { const boost::reverse_graph revg(g); - ue2::unordered_map colours; + unordered_map colours; colours.reserve(num_vertices(g)); depth_first_visit(revg, g.acceptEod, @@ -638,9 +647,9 @@ vector findUnreachable(const NGHolder &g) { /** Populates squash masks for states that can be switched off by highlander * (single match) reporters. 
*/ -map +unordered_map findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) { - map squash; + unordered_map squash; set verts; getHighlanderReporters(g, g.accept, rm, verts); @@ -661,7 +670,7 @@ findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) { // cutting the appropriate out-edges to accept and seeing which // vertices become unreachable. - ue2::unordered_map orig_to_copy; + unordered_map orig_to_copy; NGHolder h; cloneHolder(h, g, &orig_to_copy); removeEdgesToAccept(h, orig_to_copy[v]); diff --git a/src/nfagraph/ng_squash.h b/src/nfagraph/ng_squash.h index 66621a7d..489f541e 100644 --- a/src/nfagraph/ng_squash.h +++ b/src/nfagraph/ng_squash.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,9 +35,8 @@ #include "ng_holder.h" #include "som/som.h" #include "ue2common.h" -#include "util/ue2_containers.h" -#include +#include #include namespace ue2 { @@ -45,8 +44,10 @@ namespace ue2 { class NGHolder; class ReportManager; -/** Dynamically-sized bitset, as an NFA can have an arbitrary number of states. */ -typedef boost::dynamic_bitset<> NFAStateSet; +/** + * Dynamically-sized bitset, as an NFA can have an arbitrary number of states. + */ +using NFAStateSet = boost::dynamic_bitset<>; /** * Populates the squash mask for each vertex (i.e. the set of states to be left @@ -54,16 +55,16 @@ typedef boost::dynamic_bitset<> NFAStateSet; * * The NFAStateSet in the output map is indexed by vertex_index. */ -std::map findSquashers(const NGHolder &g, - som_type som = SOM_NONE); +std::unordered_map +findSquashers(const NGHolder &g, som_type som = SOM_NONE); /** Filters out squash states intended only for use in DFA construction. */ void filterSquashers(const NGHolder &g, - std::map &squash); + std::unordered_map &squash); /** Populates squash masks for states that can be switched off by highlander * (single match) reporters. */ -std::map +std::unordered_map findHighlanderSquashers(const NGHolder &g, const ReportManager &rm); } // namespace ue2 diff --git a/src/nfagraph/ng_undirected.h b/src/nfagraph/ng_undirected.h index 1e27ad79..036adcbf 100644 --- a/src/nfagraph/ng_undirected.h +++ b/src/nfagraph/ng_undirected.h @@ -37,7 +37,7 @@ #include "ng_util.h" #include "ue2common.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include @@ -71,8 +71,8 @@ template NFAUndirectedGraph createUnGraph(const Graph &g, bool excludeStarts, bool excludeAccepts, - unordered_map &old2new) { + std::unordered_map &old2new) { NFAUndirectedGraph ug; size_t idx = 0; @@ -97,7 +97,7 @@ NFAUndirectedGraph createUnGraph(const Graph &g, // Track seen edges so that we don't insert parallel edges. using Vertex = typename Graph::vertex_descriptor; - unordered_set> seen; + ue2_unordered_set> seen; seen.reserve(num_edges(g)); auto make_ordered_edge = [](Vertex a, Vertex b) { return std::make_pair(std::min(a, b), std::max(a, b)); diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index 0776fa04..59c73498 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -39,6 +39,7 @@ #include "nfa/limex_limits.h" // for NFA_MAX_TOP_MASKS. 
#include "parser/position.h" #include "util/graph_range.h" +#include "util/graph_small_color_map.h" #include "util/make_unique.h" #include "util/order_check.h" #include "util/ue2string.h" @@ -47,12 +48,14 @@ #include #include #include +#include +#include + #include #include #include using namespace std; -using boost::default_color_type; using boost::make_filtered_graph; using boost::make_assoc_property_map; @@ -214,8 +217,8 @@ bool isFloating(const NGHolder &g) { bool isAcyclic(const NGHolder &g) { try { - boost::depth_first_search(g, visitor(DetectCycles(g)) - .root_vertex(g.start)); + boost::depth_first_search(g, DetectCycles(g), make_small_color_map(g), + g.start); } catch (const CycleFound &) { return false; } @@ -226,15 +229,12 @@ bool isAcyclic(const NGHolder &g) { /** True if the graph has a cycle reachable from the given source vertex. */ bool hasReachableCycle(const NGHolder &g, NFAVertex src) { assert(hasCorrectlyNumberedVertices(g)); - vector colors(num_vertices(g)); try { // Use depth_first_visit, rather than depth_first_search, so that we // only search from src. - auto index_map = get(vertex_index, g); boost::depth_first_visit(g, src, DetectCycles(g), - make_iterator_property_map(colors.begin(), - index_map)); + make_small_color_map(g)); } catch (const CycleFound &) { return true; } @@ -246,7 +246,8 @@ bool hasBigCycles(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); set dead; BackEdges> backEdgeVisitor(dead); - boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start)); + boost::depth_first_search(g, backEdgeVisitor, make_small_color_map(g), + g.start); for (const auto &e : dead) { if (source(e, g) != target(e, g)) { @@ -257,6 +258,12 @@ bool hasBigCycles(const NGHolder &g) { return false; } +bool hasNarrowReachVertex(const NGHolder &g, size_t max_reach_count) { + return any_of_in(vertices_range(g), [&](NFAVertex v) { + return !is_special(v, g) && g[v].char_reach.count() < max_reach_count; + }); +} + bool can_never_match(const NGHolder &g) { assert(edge(g.accept, g.acceptEod, g).second); if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) { @@ -347,24 +354,19 @@ vector getTopoOrdering(const NGHolder &g) { // Use the same colour map for both DFS and topological_sort below: avoids // having to reallocate it, etc. 
- const size_t num_verts = num_vertices(g); - vector colour(num_verts); + auto colors = make_small_color_map(g); - using EdgeSet = ue2::unordered_set; + using EdgeSet = unordered_set; EdgeSet backEdges; BackEdges be(backEdges); - auto index_map = get(vertex_index, g); - depth_first_search(g, visitor(be).root_vertex(g.start) - .color_map(make_iterator_property_map( - colour.begin(), index_map))); + depth_first_search(g, visitor(be).root_vertex(g.start).color_map(colors)); auto acyclic_g = make_filtered_graph(g, make_bad_edge_filter(&backEdges)); vector ordering; - ordering.reserve(num_verts); - topological_sort(acyclic_g, back_inserter(ordering), - color_map(make_iterator_property_map(colour.begin(), index_map))); + ordering.reserve(num_vertices(g)); + topological_sort(acyclic_g, back_inserter(ordering), color_map(colors)); reorderSpecials(g, ordering); @@ -373,7 +375,7 @@ vector getTopoOrdering(const NGHolder &g) { static void mustBeSetBefore_int(NFAVertex u, const NGHolder &g, - vector &vertexColor) { + decltype(make_small_color_map(NGHolder())) &colors) { set s; insert(&s, adjacent_vertices(u, g)); @@ -390,10 +392,8 @@ void mustBeSetBefore_int(NFAVertex u, const NGHolder &g, auto prefix = make_filtered_graph(g, make_bad_edge_filter(&dead)); - depth_first_visit( - prefix, g.start, make_dfs_visitor(boost::null_visitor()), - make_iterator_property_map(vertexColor.begin(), - get(vertex_index, g))); + depth_first_visit(prefix, g.start, make_dfs_visitor(boost::null_visitor()), + colors); } bool mustBeSetBefore(NFAVertex u, NFAVertex v, const NGHolder &g, @@ -406,14 +406,14 @@ bool mustBeSetBefore(NFAVertex u, NFAVertex v, const NGHolder &g, return cache.cache[key]; } - vector vertexColor(num_vertices(g)); - mustBeSetBefore_int(u, g, vertexColor); + auto colors = make_small_color_map(g); + mustBeSetBefore_int(u, g, colors); for (auto vi : vertices_range(g)) { auto key2 = make_pair(g[u].index, g[vi].index); DEBUG_PRINTF("adding %zu %zu\n", key2.first, key2.second); assert(!contains(cache.cache, key2)); - bool value = vertexColor[g[vi].index] == boost::white_color; + bool value = get(colors, vi) == small_color::white; cache.cache[key2] = value; assert(contains(cache.cache, key2)); } @@ -450,8 +450,8 @@ void appendLiteral(NGHolder &h, const ue2_literal &s) { } } -ue2::flat_set getTops(const NGHolder &h) { - ue2::flat_set tops; +flat_set getTops(const NGHolder &h) { + flat_set tops; for (const auto &e : out_edges_range(h.start, h)) { insert(&tops, h[e].tops); } @@ -470,7 +470,7 @@ void setTops(NGHolder &h, u32 top) { void clearReports(NGHolder &g) { DEBUG_PRINTF("clearing reports without an accept edge\n"); - ue2::unordered_set allow; + unordered_set allow; insert(&allow, inv_adjacent_vertices(g.accept, g)); insert(&allow, inv_adjacent_vertices(g.acceptEod, g)); allow.erase(g.accept); // due to stylised edge. 
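The make_small_color_map() calls above replace per-vertex vectors of boost::default_color_type accessed through iterator property maps. A DFS colour only needs three values (white/gray/black), so a map of this kind can be packed at two bits per vertex, four vertices to a byte. A minimal sketch of that layout, assuming a simple byte-packed store (this is illustrative only, not the util/graph_small_color_map.h implementation):

    // Sketch only: a 2-bit-per-vertex colour store of the kind
    // make_small_color_map() plausibly builds; not Hyperscan's code.
    #include <cstddef>
    #include <vector>

    enum class small_color : unsigned char { white = 0, gray = 1, black = 2 };

    class small_color_store {
        std::vector<unsigned char> data; // four 2-bit colours per byte
    public:
        explicit small_color_store(size_t n) : data((n + 3) / 4, 0) {} // zero == all white
        small_color get(size_t i) const {
            return static_cast<small_color>((data[i / 4] >> ((i % 4) * 2)) & 0x3);
        }
        void set(size_t i, small_color c) {
            unsigned shift = unsigned(i % 4) * 2;
            data[i / 4] = static_cast<unsigned char>(
                (data[i / 4] & ~(0x3u << shift)) | (unsigned(c) << shift));
        }
    };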
@@ -494,7 +494,7 @@ void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new) { static void fillHolderOutEdges(NGHolder &out, const NGHolder &in, - const ue2::unordered_map &v_map, + const unordered_map &v_map, NFAVertex u) { NFAVertex u_new = v_map.at(u); @@ -516,9 +516,9 @@ void fillHolderOutEdges(NGHolder &out, const NGHolder &in, } void fillHolder(NGHolder *outp, const NGHolder &in, const deque &vv, - ue2::unordered_map *v_map_out) { + unordered_map *v_map_out) { NGHolder &out = *outp; - ue2::unordered_map &v_map = *v_map_out; + unordered_map &v_map = *v_map_out; out.kind = in.kind; @@ -600,7 +600,7 @@ void cloneHolder(NGHolder &out, const NGHolder &in) { } void cloneHolder(NGHolder &out, const NGHolder &in, - ue2::unordered_map *mapping) { + unordered_map *mapping) { cloneHolder(out, in); vector out_verts(num_vertices(in)); for (auto v : vertices_range(out)) { @@ -623,7 +623,7 @@ unique_ptr cloneHolder(const NGHolder &in) { void reverseHolder(const NGHolder &g_in, NGHolder &g) { // Make the BGL do the grunt work. - ue2::unordered_map vertexMap; + unordered_map vertexMap; boost::transpose_graph(g_in, g, orig_to_copy(boost::make_assoc_property_map(vertexMap))); diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h index 1d3a6f32..3cc9c7c3 100644 --- a/src/nfagraph/ng_util.h +++ b/src/nfagraph/ng_util.h @@ -32,16 +32,17 @@ #ifndef NG_UTIL_H #define NG_UTIL_H -#include -#include +#include "ng_holder.h" +#include "ue2common.h" +#include "util/flat_containers.h" +#include "util/graph.h" +#include "util/graph_range.h" #include // for default_dfs_visitor -#include "ng_holder.h" -#include "ue2common.h" -#include "util/graph.h" -#include "util/graph_range.h" -#include "util/ue2_containers.h" +#include +#include +#include namespace ue2 { @@ -233,6 +234,12 @@ bool hasReachableCycle(const NGHolder &g, NFAVertex src); /** True if g has any cycles which are not self-loops. */ bool hasBigCycles(const NGHolder &g); +/** + * \brief True if g has at least one non-special vertex with reach smaller than + * max_reach_count. The default of 200 is pretty conservative. + */ +bool hasNarrowReachVertex(const NGHolder &g, size_t max_reach_count = 200); + /** Returns the set of all vertices that appear in any of the graph's cycles. */ std::set findVerticesInCycles(const NGHolder &g); @@ -266,12 +273,12 @@ void appendLiteral(NGHolder &h, const ue2_literal &s); * \a in). A vertex mapping is returned in \a v_map_out. */ void fillHolder(NGHolder *outp, const NGHolder &in, const std::deque &vv, - unordered_map *v_map_out); + std::unordered_map *v_map_out); /** \brief Clone the graph in \a in into graph \a out, returning a vertex * mapping in \a v_map_out. */ void cloneHolder(NGHolder &out, const NGHolder &in, - unordered_map *v_map_out); + std::unordered_map *v_map_out); /** \brief Clone the graph in \a in into graph \a out. 
*/ void cloneHolder(NGHolder &out, const NGHolder &in); diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 4195045c..9ce732c2 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -57,13 +57,14 @@ #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" +#include "util/insertion_ordered.h" #include "util/make_unique.h" #include "util/order_check.h" #include "util/target_info.h" #include "util/ue2string.h" -#include "util/ue2_containers.h" #include #include @@ -559,7 +560,7 @@ void filterCandPivots(const NGHolder &g, const set &cand_raw, static void getCandidatePivots(const NGHolder &g, set *cand, set *cand_raw) { - ue2::unordered_map dominators = findDominators(g); + auto dominators = findDominators(g); set accepts; @@ -1023,8 +1024,8 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, shared_ptr lhs = make_shared(); shared_ptr rhs = make_shared(); - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map); DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n", @@ -1076,24 +1077,21 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, insert(&splitter_reports, base_graph[v].reports); } - /* find the targets of each source vertex; note the use of vectors to + /* find the targets of each source vertex; insertion_ordered_map used to * preserve deterministic ordering */ - vector sources; - map> images; + insertion_ordered_map> images; for (const RoseInEdge &e : ee) { RoseInVertex src = source(e, vg); RoseInVertex dest = target(e, vg); - if (!contains(images, src)) { - sources.push_back(src); - } images[src].push_back(dest); remove_edge(e, vg); } map, vector> verts_by_image; - for (const auto &u : sources) { - const auto &image = images[u]; + for (const auto &m : images) { + const auto &u = m.first; + const auto &image = m.second; if (contains(verts_by_image, image)) { for (RoseInVertex v : verts_by_image[image]) { @@ -1198,7 +1196,8 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, const vector &to_cut, const vector &cut, const map> &cut_lits) { - DEBUG_PRINTF("splitting %s:\n", to_string(h.kind).c_str()); + DEBUG_PRINTF("splitting %s (%zu vertices)\n", to_string(h.kind).c_str(), + num_vertices(h)); /* create literal vertices and connect preds */ unordered_set done_sources; @@ -1216,7 +1215,7 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, NFAVertex pivot = target(e, h); DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index); - ue2::unordered_map temp_map; + unordered_map temp_map; shared_ptr new_lhs = make_shared(); splitLHS(h, pivot, new_lhs.get(), &temp_map); @@ -1233,7 +1232,9 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, renumber_vertices(*new_lhs); renumber_edges(*new_lhs); - DEBUG_PRINTF(" into lhs %s\n", to_string(new_lhs->kind).c_str()); + DEBUG_PRINTF(" into lhs %s (%zu vertices)\n", + to_string(new_lhs->kind).c_str(), + num_vertices(*new_lhs)); assert(hasCorrectlyNumberedVertices(*new_lhs)); assert(hasCorrectlyNumberedEdges(*new_lhs)); @@ -1295,14 +1296,15 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, effort */ if (!contains(done_rhs, adj)) { - ue2::unordered_map temp_map; + unordered_map temp_map; shared_ptr new_rhs = make_shared(); splitRHS(h, adj, new_rhs.get(), &temp_map); remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); 
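Several of the changes in this file (and later in rose_build_add.cpp and rose_build_bytecode.cpp) collapse the old "map plus a separate vector recording first-seen order" pattern into insertion_ordered_map, which iterates keys in insertion order while keeping constant-time lookup. A rough sketch of such a container, assuming a plain vector-plus-index layout (the real util/insertion_ordered.h is the authoritative version):

    // Illustrative only: one way to get deterministic, first-insertion
    // iteration order with O(1) average lookup. Not util/insertion_ordered.h.
    #include <cstddef>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    template <typename K, typename V>
    class ordered_map_sketch {
        std::vector<std::pair<K, V>> elems;  // elements, in insertion order
        std::unordered_map<K, size_t> index; // key -> position in elems
    public:
        V &operator[](const K &k) {
            auto it = index.find(k);
            if (it != index.end()) {
                return elems[it->second].second;
            }
            index.emplace(k, elems.size());
            elems.emplace_back(k, V());
            return elems.back().second;
        }
        auto begin() { return elems.begin(); } // deterministic iteration
        auto end() { return elems.end(); }
        void clear() { elems.clear(); index.clear(); }
    };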
remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); renumber_edges(*new_rhs); - DEBUG_PRINTF(" into rhs %s\n", - to_string(new_rhs->kind).c_str()); + DEBUG_PRINTF(" into rhs %s (%zu vertices)\n", + to_string(new_rhs->kind).c_str(), + num_vertices(*new_rhs)); done_rhs.emplace(adj, new_rhs); assert(isCorrectlyTopped(*new_rhs)); } @@ -1552,6 +1554,12 @@ void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, continue; } + if (g[e].dfa) { + /* if we removed any more states, we would need to rebuild the + * the dfa which can be time consuming. */ + continue; + } + assert(!g[t].delay); const ue2_literal &lit = g[t].s; @@ -1669,6 +1677,8 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, /* already removed redundant parts of literals */ return; } + + assert(!ig[e].dfa); } map, u32> > graphs; /* + delay */ @@ -1701,6 +1711,11 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, continue; } + if (!delay) { + /* unable to trim graph --> no point swapping to new holder */ + continue; + } + assert(isCorrectlyTopped(*h_new)); graphs[right] = make_pair(h_new, delay); } @@ -1726,8 +1741,7 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, static void removeRedundantLiteralsFromInfixes(RoseInGraph &g, const CompileContext &cc) { - vector seen_order; - map> infixes; + insertion_ordered_map> infixes; for (const RoseInEdge &e : edges_range(g)) { RoseInVertex s = source(e, g); @@ -1742,20 +1756,23 @@ void removeRedundantLiteralsFromInfixes(RoseInGraph &g, } assert(!g[t].delay); + if (g[e].dfa) { + /* if we removed any more states, we would need to rebuild the + * the dfa which can be time consuming. */ + continue; + } NGHolder *h = g[e].graph.get(); - if (!contains(infixes, h)) { - seen_order.push_back(h); - } infixes[h].push_back(e); } - for (NGHolder *h : seen_order) { - removeRedundantLiteralsFromInfix(*h, g, infixes[h], cc); + for (const auto &m : infixes) { + NGHolder *h = m.first; + const auto &edges = m.second; + removeRedundantLiteralsFromInfix(*h, g, edges, cc); } } - static void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) { removeRedundantLiteralsFromPrefixes(g, cc); @@ -2067,13 +2084,13 @@ void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n"); RoseInVertex start = getStart(vg); + insertion_ordered_map> prefixes; bool changed; u32 gen = 0; do { DEBUG_PRINTF("gen %u\n", gen); changed = false; - vector seen_order; - map > prefixes; + prefixes.clear(); /* find prefixes */ for (const RoseInEdge &e : out_edges_range(start, vg)) { @@ -2081,9 +2098,6 @@ void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { assert(vg[target(e, vg)].type == RIV_LITERAL); if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); - if (!contains(prefixes, h)) { - seen_order.push_back(h); - } prefixes[h].push_back(e); } } @@ -2093,14 +2107,16 @@ void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { } /* look for bad prefixes and try to split */ - for (NGHolder *h : seen_order) { + for (const auto &m : prefixes) { + NGHolder *h = m.first; + const auto &edges = m.second; depth max_width = findMaxWidth(*h); if (willBeTransient(max_width, cc) || willBeAnchoredTable(max_width, cc.grey)) { continue; } - changed = improvePrefix(*h, vg, prefixes[h], cc); + changed = improvePrefix(*h, vg, edges, cc); } } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN); } @@ -2128,24 +2144,25 @@ void extractStrongLiterals(RoseInGraph &vg, const CompileContext 
&cc) { if (!cc.grey.violetExtractStrongLiterals) { return; } - STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n"); - set stuck; + STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n"); + + unordered_set stuck; + insertion_ordered_map> edges_by_graph; bool changed; + do { changed = false; - vector seen_order; - map > edges_by_graph; + edges_by_graph.clear(); for (const RoseInEdge &ve : edges_range(vg)) { if (vg[source(ve, vg)].type != RIV_LITERAL) { continue; } + if (vg[ve].graph) { - if (!contains(edges_by_graph, vg[ve].graph.get())) { - seen_order.push_back(vg[ve].graph.get()); - } - edges_by_graph[vg[ve].graph.get()].push_back(ve); + NGHolder *h = vg[ve].graph.get(); + edges_by_graph[h].push_back(ve); } } @@ -2154,12 +2171,14 @@ void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) { return; } - for (NGHolder *g : seen_order) { + for (const auto &m : edges_by_graph) { + NGHolder *g = m.first; + const auto &edges = m.second; if (contains(stuck, g)) { DEBUG_PRINTF("already known to be bad\n"); continue; } - bool rv = extractStrongLiteral(*g, vg, edges_by_graph[g], cc); + bool rv = extractStrongLiteral(*g, vg, edges, cc); if (rv) { changed = true; } else { @@ -2207,8 +2226,7 @@ void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { RoseInVertex start = getStart(vg); - set weak; - vector ordered_weak; + unordered_set weak; for (RoseInVertex vv : adjacent_vertices_range(start, vg)) { /* outfixes shouldn't have made it this far */ @@ -2224,22 +2242,22 @@ void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { NGHolder *h = vg[e].graph.get(); DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h); - if (!contains(weak, h)) { - weak.insert(h); - ordered_weak.push_back(h); - } + weak.insert(h); } } - map > weak_edges; + insertion_ordered_map> weak_edges; for (const RoseInEdge &ve : edges_range(vg)) { - if (contains(weak, vg[ve].graph.get())) { - weak_edges[vg[ve].graph.get()].push_back(ve); + NGHolder *h = vg[ve].graph.get(); + if (contains(weak, h)) { + weak_edges[h].push_back(ve); } } - for (NGHolder *h : ordered_weak) { - improveInfix(*h, vg, weak_edges[h], cc); + for (const auto &m : weak_edges) { + NGHolder *h = m.first; + const auto &edges = m.second; + improveInfix(*h, vg, edges, cc); } } @@ -2395,8 +2413,8 @@ void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n"); RoseInVertex accept = getPrimaryAccept(vg); - map > suffixes; - vector ordered_suffixes; + + insertion_ordered_map> suffixes; /* find suffixes */ for (const RoseInEdge &e : in_edges_range(accept, vg)) { @@ -2405,15 +2423,14 @@ void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { assert(vg[e].graph); /* non suffix paths should be wired to other accepts */ const NGHolder *h = vg[e].graph.get(); - if (!contains(suffixes, h)) { - ordered_suffixes.push_back(h); - } suffixes[h].push_back(e); } /* look at suffixes and try to split */ - for (const NGHolder *h : ordered_suffixes) { - replaceSuffixWithInfix(*h, vg, suffixes[h], cc); + for (const auto &m : suffixes) { + const NGHolder *h = m.first; + const auto &edges = m.second; + replaceSuffixWithInfix(*h, vg, edges, cc); } } @@ -2497,20 +2514,18 @@ void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { return; } - map > right_edges; - vector ordered_graphs; + insertion_ordered_map> right_edges; for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); - if (!contains(right_edges, h)) 
{ - ordered_graphs.push_back(h); - } right_edges[h].push_back(ve); } } - for (const NGHolder *h : ordered_graphs) { - lookForDoubleCut(*h, right_edges[h], vg, cc.grey); + for (const auto &m : right_edges) { + const NGHolder *h = m.first; + const auto &edges = m.second; + lookForDoubleCut(*h, edges, vg, cc.grey); } } @@ -2635,24 +2650,22 @@ void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { return; } + insertion_ordered_map> right_edges; bool changed; do { changed = false; - map > right_edges; - vector ordered_graphs; + right_edges.clear(); for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); - if (!contains(right_edges, h)) { - ordered_graphs.push_back(h); - } right_edges[h].push_back(ve); } } - for (const NGHolder *h : ordered_graphs) { - const vector &ee = right_edges[h]; + for (const auto &m : right_edges) { + const NGHolder *h = m.first; + const vector &ee = m.second; bool rv = lookForDoubleCut(*h, ee, vg, cc.grey); if (!rv && h->kind != NFA_SUFFIX) { rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey); @@ -2680,39 +2693,34 @@ static void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { u32 gen = 0; - vector prev = {getStart(vg)}; + insertion_ordered_set prev({getStart(vg)}); + insertion_ordered_set curr; while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { - /* collect vertices in edge order for determinism */ - vector curr; - set curr_seen; + curr.clear(); for (RoseInVertex u : prev) { for (auto v : adjacent_vertices_range(u, vg)) { - if (curr_seen.insert(v).second) { - curr.push_back(v); - } + curr.insert(v); } } - map> rightfixes; - vector ordered_graphs; + insertion_ordered_map> rightfixes; for (RoseInVertex v : curr) { for (const RoseInEdge &e : out_edges_range(v, vg)) { if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); - if (!contains(rightfixes, h)) { - ordered_graphs.push_back(h); - } rightfixes[h].push_back(e); } } } - for (const NGHolder *h : ordered_graphs) { - lookForCleanSplit(*h, rightfixes[h], vg, cc); + for (const auto &m : rightfixes) { + const NGHolder *h = m.first; + const auto &edges = m.second; + lookForCleanSplit(*h, edges, vg, cc); } - prev = curr; + prev = std::move(curr); gen++; } } @@ -2828,9 +2836,9 @@ bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h, #define MAX_EDGES_FOR_IMPLEMENTABILITY 50 static -bool splitForImplementabilty(RoseInGraph &vg, NGHolder &h, - const vector &edges, - const CompileContext &cc) { +bool splitForImplementability(RoseInGraph &vg, NGHolder &h, + const vector &edges, + const CompileContext &cc) { vector> succ_lits; DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n", to_string(h.kind).c_str(), num_vertices(h), edges.size()); @@ -2867,7 +2875,7 @@ bool splitForImplementabilty(RoseInGraph &vg, NGHolder &h, } DEBUG_PRINTF("trying to netflow\n"); - bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey); + bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey); DEBUG_PRINTF("done\n"); return rv; @@ -2882,28 +2890,33 @@ bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, bool changed = false; bool need_to_recalc = false; u32 added_count = 0; + unordered_set> good; /* known to be implementable */ do { changed = false; DEBUG_PRINTF("added %u\n", added_count); - map > edges_by_graph; - vector graphs; + insertion_ordered_map, + vector> edges_by_graph; for (const RoseInEdge &ve : edges_range(vg)) { - if (vg[ve].graph) { - NGHolder *h = 
vg[ve].graph.get(); - if (!contains(edges_by_graph, h)) { - graphs.push_back(h); - } + if (vg[ve].graph && !vg[ve].dfa) { + auto &h = vg[ve].graph; edges_by_graph[h].push_back(ve); } } - for (NGHolder *h : graphs) { + for (auto &m : edges_by_graph) { + auto &h = m.first; + if (contains(good, h)) { + continue; + } + reduceGraphEquivalences(*h, cc); if (isImplementableNFA(*h, &rm, cc)) { + good.insert(h); continue; } - if (tryForEarlyDfa(*h, cc) - && doEarlyDfa(rose, vg, *h, edges_by_graph[h], final_chance, rm, - cc)) { + const auto &edges = m.second; + + if (tryForEarlyDfa(*h, cc) && + doEarlyDfa(rose, vg, *h, edges, final_chance, rm, cc)) { continue; } @@ -2912,8 +2925,12 @@ bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, return false; } - if (splitForImplementabilty(vg, *h, edges_by_graph[h], cc)) { + if (splitForImplementability(vg, *h, edges, cc)) { added_count++; + if (added_count > MAX_IMPLEMENTABLE_SPLITS) { + DEBUG_PRINTF("added_count hit limit\n"); + return false; + } changed = true; continue; } @@ -2921,9 +2938,7 @@ bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, return false; } - if (added_count > MAX_IMPLEMENTABLE_SPLITS) { - return false; - } + assert(added_count <= MAX_IMPLEMENTABLE_SPLITS); if (changed) { removeRedundantLiterals(vg, cc); @@ -2952,6 +2967,13 @@ RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance, return vg; } + /* Avoid running the Violet analysis at all on graphs with no vertices with + * small reach, since we will not be able to extract any literals. */ + if (!hasNarrowReachVertex(h)) { + DEBUG_PRINTF("fail, no vertices with small reach\n"); + return vg; + } + DEBUG_PRINTF("hello world\n"); /* Step 1: avoid outfixes as we always have to run them. 
*/ diff --git a/src/nfagraph/ng_width.cpp b/src/nfagraph/ng_width.cpp index c2e9eb1a..4c33220c 100644 --- a/src/nfagraph/ng_width.cpp +++ b/src/nfagraph/ng_width.cpp @@ -37,6 +37,7 @@ #include "ue2common.h" #include "util/depth.h" #include "util/graph.h" +#include "util/graph_small_color_map.h" #include #include @@ -143,7 +144,7 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, assert(hasCorrectlyNumberedVertices(h)); const size_t num = num_vertices(h); vector distance(num); - vector colors(num); + auto colors = make_small_color_map(h); auto index_map = get(&NFAGraphVertexProps::index, g); @@ -151,15 +152,15 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, dag_shortest_paths(g, src, distance_map(make_iterator_property_map(distance.begin(), index_map)) .weight_map(boost::make_constant_property(-1)) - .color_map(make_iterator_property_map(colors.begin(), index_map))); + .color_map(colors)); depth acceptDepth, acceptEodDepth; - if (colors.at(NODE_ACCEPT) == boost::white_color) { + if (get(colors, h.accept) == small_color::white) { acceptDepth = depth::unreachable(); } else { acceptDepth = depth(-1 * distance.at(NODE_ACCEPT)); } - if (colors.at(NODE_ACCEPT_EOD) == boost::white_color) { + if (get(colors, h.acceptEod) == small_color::white) { acceptEodDepth = depth::unreachable(); } else { acceptEodDepth = depth(-1 * distance.at(NODE_ACCEPT_EOD)); diff --git a/src/parser/ComponentRepeat.cpp b/src/parser/ComponentRepeat.cpp index ff02703c..09f59d05 100644 --- a/src/parser/ComponentRepeat.cpp +++ b/src/parser/ComponentRepeat.cpp @@ -234,7 +234,7 @@ void ComponentRepeat::optimise(bool connected_to_sds) { } bool ComponentRepeat::vacuous_everywhere() const { - return !m_min; + return !m_min || sub_comp->vacuous_everywhere(); } bool ComponentRepeat::checkEmbeddedStartAnchor(bool at_start) const { diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index ce9ca865..8643aebf 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -53,8 +53,8 @@ #include "parser/Parser.h" #include "ue2common.h" #include "util/compare.h" +#include "util/flat_containers.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" #include "util/unicode_def.h" #include "util/verify_types.h" @@ -1950,7 +1950,7 @@ unique_ptr parse(const char *ptr, ParseMode &globalMode) { unsigned groupIndex = 1; // Set storing group names that are currently in use. - ue2::flat_set groupNames; + flat_set groupNames; // Root sequence. unique_ptr rootSeq = ue2::make_unique(); diff --git a/src/parser/buildstate.cpp b/src/parser/buildstate.cpp index eb25550b..75cfbb7b 100644 --- a/src/parser/buildstate.cpp +++ b/src/parser/buildstate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,8 +39,10 @@ #include "nfagraph/ng_builder.h" #include "util/charreach.h" #include "util/container.h" +#include "util/flat_containers.h" +#include "util/hash.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include #include @@ -449,7 +451,7 @@ unique_ptr makeGlushkovBuildState(NFABuilder &b, * Scans through a list of positions and retains only the highest priority * version of a given (position, flags) entry. 
*/ void cleanupPositions(vector &a) { - ue2::unordered_set> seen; // track dupes + ue2_unordered_set> seen; vector out; out.reserve(a.size()); // output should be close to input in size. diff --git a/src/parser/check_refs.cpp b/src/parser/check_refs.cpp index fae68f74..60b5b6ba 100644 --- a/src/parser/check_refs.cpp +++ b/src/parser/check_refs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,7 @@ #include "ConstComponentVisitor.h" #include "parse_error.h" #include "util/container.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include @@ -114,7 +114,7 @@ public: ReferenceVisitor::~ReferenceVisitor() {} void checkReferences(const Component &root, unsigned int groupIndices, - const ue2::flat_set &groupNames) { + const flat_set &groupNames) { ReferenceVisitor vis(groupIndices, groupNames); root.accept(vis); } diff --git a/src/parser/check_refs.h b/src/parser/check_refs.h index ede44896..26912fb8 100644 --- a/src/parser/check_refs.h +++ b/src/parser/check_refs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,14 +26,16 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Component tree analysis that checks that references (such as * back-refs, conditionals) have valid referents. */ -#ifndef PARSER_CHECK_REFS_H_ -#define PARSER_CHECK_REFS_H_ -#include "util/ue2_containers.h" +#ifndef PARSER_CHECK_REFS_H +#define PARSER_CHECK_REFS_H + +#include "util/flat_containers.h" #include @@ -43,8 +45,8 @@ class Component; class ComponentSequence; void checkReferences(const Component &root, unsigned int groupIndices, - const ue2::flat_set &groupNames); + const flat_set &groupNames); } // namespace ue2 -#endif // PARSER_CHECK_REFS_H_ +#endif // PARSER_CHECK_REFS_H diff --git a/src/rose/block.c b/src/rose/block.c index fc72c6e9..2c493219 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -178,13 +178,11 @@ void roseBlockEodExec(const struct RoseEngine *t, u64a offset, assert(!scratch->tctxt.filledDelayedSlots); const u64a som = 0; - const size_t match_len = 0; const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; // Note: we ignore the result, as this is the last thing to ever happen on // a scan. 
- roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - flags); + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); } /** diff --git a/src/rose/catchup.c b/src/rose/catchup.c index 82537241..9e36d091 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,13 +51,12 @@ static really_inline int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, u64a som, u64a offset, ReportID id, const char from_mpv) { const u32 program = id; - const size_t match_len = 0; // Unused in this path. u8 flags = ROSE_PROG_FLAG_IN_CATCHUP; if (from_mpv) { flags |= ROSE_PROG_FLAG_FROM_MPV; } - roseRunProgram(rose, scratch, program, som, offset, match_len, flags); + roseRunProgram(rose, scratch, program, som, offset, flags); return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } diff --git a/src/rose/match.c b/src/rose/match.c index daf81eac..5d1b6e07 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -66,9 +66,8 @@ void printMatch(const struct core_info *ci, u64a start, u64a end) { } #endif -hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, - void *ctx) { - struct hs_scratch *scratch = ctx; +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch) { struct RoseContext *tctx = &scratch->tctxt; struct core_info *ci = &scratch->core_info; const struct RoseEngine *t = ci->rose; @@ -77,9 +76,9 @@ hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, u64a real_end = ci->buf_offset - rb_len + end + 1; // index after last byte #ifdef DEBUG - DEBUG_PRINTF("REBUILD MATCH id=%u offsets=[%llu,%llu]: ", id, - start + ci->buf_offset - rb_len, real_end); - printMatch(ci, start + ci->buf_offset - rb_len, real_end); + DEBUG_PRINTF("REBUILD MATCH id=%u end offset@%llu]: ", id, real_end); + u64a start = real_end < 8 ? 1 : real_end - 7; + printMatch(ci, start, real_end); printf("\n"); #endif @@ -87,10 +86,9 @@ hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, assert(id && id < t->size); // id is a program offset const u64a som = 0; - const size_t match_len = end - start + 1; const u8 flags = 0; UNUSED hwlmcb_rv_t rv = - roseRunProgram(t, scratch, id, som, real_end, match_len, flags); + roseRunProgram(t, scratch, id, som, real_end, flags); assert(rv != HWLM_TERMINATE_MATCHING); /* we are just repopulating the delay queue, groups should be @@ -200,8 +198,6 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { return MO_HALT_MATCHING; } - const size_t match_len = 0; - /* delayed literals need to be delivered before real literals; however * delayed literals only come from the floating table so if we are going * to deliver a literal here it must be too early for a delayed literal */ @@ -216,8 +212,8 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { // Note that the "id" we have been handed is the program offset. 
const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; - if (roseRunProgram(t, scratch, id, start, real_end, match_len, - flags) == HWLM_TERMINATE_MATCHING) { + if (roseRunProgram(t, scratch, id, start, real_end, flags) + == HWLM_TERMINATE_MATCHING) { assert(can_stop_matching(scratch)); DEBUG_PRINTF("caller requested termination\n"); return MO_HALT_MATCHING; @@ -237,12 +233,12 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { static really_inline hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, struct hs_scratch *scratch, u64a end, - size_t match_len, u32 id) { + u32 id) { DEBUG_PRINTF("id=%u\n", id); assert(id && id < t->size); // id is an offset into bytecode const u64a som = 0; const u8 flags = 0; - return roseRunProgram_i(t, scratch, id, som, end, match_len, flags); + return roseRunProgram_i(t, scratch, id, som, end, flags); } static rose_inline @@ -274,7 +270,7 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, const u64a som = 0; const u8 flags = 0; hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, offset, - 0, flags); + flags); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); /* delayed literals can't safely set groups. @@ -311,7 +307,7 @@ hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t, const u64a som = 0; const u8 flags = 0; hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, curr_loc, - 0, flags); + flags); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); /* anchored literals can't safely set groups. @@ -476,17 +472,16 @@ anchored_leftovers:; } static really_inline -hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { - struct hs_scratch *scratch = ctxt; +hwlmcb_rv_t roseCallback_i(size_t end, u32 id, struct hs_scratch *scratch) { struct RoseContext *tctx = &scratch->tctxt; const struct RoseEngine *t = scratch->core_info.rose; u64a real_end = end + tctx->lit_offset_adjust; #if defined(DEBUG) - DEBUG_PRINTF("MATCH id=%u offsets=[%llu,%llu]: ", id, - start + tctx->lit_offset_adjust, real_end); - printMatch(&scratch->core_info, start + tctx->lit_offset_adjust, real_end); + DEBUG_PRINTF("MATCH id=%u end offset@%llu: ", id, real_end); + u64a start = real_end < 8 ? 
1 : real_end - 7; + printMatch(&scratch->core_info, start, real_end); printf("\n"); #endif DEBUG_PRINTF("last end %llu\n", tctx->lastEndOffset); @@ -510,8 +505,7 @@ hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { return HWLM_TERMINATE_MATCHING; } - size_t match_len = end - start + 1; - rv = roseProcessMatchInline(t, scratch, real_end, match_len, id); + rv = roseProcessMatchInline(t, scratch, real_end, id); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctx->groups); @@ -524,15 +518,15 @@ hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { return HWLM_TERMINATE_MATCHING; } -hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { - return roseCallback_i(start, end, id, ctxt); +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch) { + return roseCallback_i(end, id, scratch); } -hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctxt) { - struct hs_scratch *scratch = ctxt; +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch) { const struct RoseEngine *t = scratch->core_info.rose; - return roseCallback_i(start, end, id, ctxt) & t->floating_group_mask; + return roseCallback_i(end, id, scratch) & t->floating_group_mask; } /** @@ -567,10 +561,9 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, scratch->tctxt.minMatchOffset = stream_offset; const u64a som = 0; - const size_t match_len = 0; const u8 flags = 0; hwlmcb_rv_t rv = roseRunProgram(rose, scratch, program, som, stream_offset, - match_len, flags); + flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } @@ -588,10 +581,9 @@ int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { // Our match ID is the program offset. const u32 program = id; - const size_t match_len = 0; // Unused in this path. 
const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, start, end, match_len, flags); + roseRunProgram(rose, scratch, program, start, end, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/rose/match.h b/src/rose/match.h index b69ff158..0d4fb19c 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,10 +52,11 @@ int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context); /* Callbacks, defined in match.c */ -hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctx); -hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctx); -hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, - void *ctx); +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch); +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch); int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx); /* Common code, used all over Rose runtime */ diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 23532d40..2f2a6aa3 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,8 +43,6 @@ int roseNfaEarliestSom(u64a start, UNUSED u64a end, UNUSED ReportID id, hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, - u8 prog_flags) { - return roseRunProgram_i(t, scratch, programOffset, som, end, match_len, - prog_flags); + u64a som, u64a end, u8 prog_flags) { + return roseRunProgram_i(t, scratch, programOffset, som, end, prog_flags); } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index b140a2bc..e6ce9bdb 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -69,7 +69,7 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, u8 prog_flags); + u64a som, u64a end, u8 prog_flags); /* Inline implementation follows. 
*/ @@ -1838,8 +1838,7 @@ void updateSeqPoint(struct RoseContext *tctxt, u64a offset, static rose_inline hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, UNUSED size_t match_len, - u8 prog_flags) { + u64a som, u64a end, u8 prog_flags) { DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, som, end, prog_flags); @@ -2571,6 +2570,24 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } } PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION } } diff --git a/src/rose/rose.h b/src/rose/rose.h index 9a50f0e9..b29519b6 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,7 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); void roseStreamEodExec(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch); -hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *context); +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); diff --git a/src/rose/rose_build.h b/src/rose/rose_build.h index cbb925f7..ca3ba369 100644 --- a/src/rose/rose_build.h +++ b/src/rose/rose_build.h @@ -42,8 +42,8 @@ #include "rose_in_graph.h" #include "util/bytecode_ptr.h" #include "util/charreach.h" +#include "util/flat_containers.h" #include "util/noncopyable.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" #include @@ -73,7 +73,7 @@ public: /** \brief True if we can not establish that at most a single callback will * be generated at a given offset from this set of reports. */ - virtual bool requiresDedupeSupport(const ue2::flat_set &reports) + virtual bool requiresDedupeSupport(const flat_set &reports) const = 0; }; @@ -85,7 +85,7 @@ public: /** \brief Adds a single literal. */ virtual void add(bool anchored, bool eod, const ue2_literal &lit, - const ue2::flat_set &ids) = 0; + const flat_set &ids) = 0; virtual bool addRose(const RoseInGraph &ig, bool prefilter) = 0; virtual bool addSombeRose(const RoseInGraph &ig) = 0; @@ -99,17 +99,17 @@ public: /** \brief Returns true if we were able to add it as a mask. */ virtual bool add(bool anchored, const std::vector &mask, - const ue2::flat_set &reports) = 0; + const flat_set &reports) = 0; /** \brief Attempts to add the graph to the anchored acyclic table. Returns * true on success. 
*/ virtual bool addAnchoredAcyclic(const NGHolder &graph) = 0; virtual bool validateMask(const std::vector &mask, - const ue2::flat_set &reports, + const flat_set &reports, bool anchored, bool eod) const = 0; virtual void addMask(const std::vector &mask, - const ue2::flat_set &reports, bool anchored, + const flat_set &reports, bool anchored, bool eod) = 0; /** \brief Construct a runtime implementation. */ @@ -134,10 +134,6 @@ std::unique_ptr makeRoseBuilder(ReportManager &rm, bool roseCheckRose(const RoseInGraph &ig, bool prefilter, const ReportManager &rm, const CompileContext &cc); -/* used by heuristics to determine the small write engine. High numbers are - * intended to indicate a lightweight rose. */ -u32 roseQuality(const RoseEngine *t); - bool roseIsPureLiteral(const RoseEngine *t); size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay); diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 4c895caf..71f1667d 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -55,6 +55,7 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" +#include "util/insertion_ordered.h" #include "util/make_unique.h" #include "util/noncopyable.h" #include "util/order_check.h" @@ -85,7 +86,7 @@ struct RoseBuildData : noncopyable { /** Edges we've transformed (in \ref transformAnchoredLiteralOverlap) which * require ANCH history to prevent overlap. */ - ue2::unordered_set anch_history_edges; + unordered_set anch_history_edges; /** True if we're tracking Start of Match. */ bool som; @@ -121,7 +122,7 @@ RoseVertex createVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, u32 minBound, u32 maxBound, u32 literalId, size_t literalLength, - const ue2::flat_set &reports) { + const flat_set &reports) { assert(parent != RoseGraph::null_vertex()); RoseGraph &g = build->g; @@ -1525,8 +1526,7 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { renumber_vertices(in); assert(validateKinds(in)); - map > graphs; - vector ordered_graphs; // Stored in first-encounter order. + insertion_ordered_map> graphs; for (const auto &e : edges_range(in)) { if (!in[e].graph) { @@ -1544,21 +1544,17 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { NGHolder *h = in[e].graph.get(); assert(isCorrectlyTopped(*h)); - if (!contains(graphs, h)) { - ordered_graphs.push_back(h); - } graphs[h].push_back(e); } - assert(ordered_graphs.size() == graphs.size()); - vector graph_edges; - for (auto h : ordered_graphs) { + for (const auto &m : graphs) { + NGHolder *h = m.first; if (!canImplementGraph(*h, prefilter, rm, cc)) { return false; } - insert(&graph_edges, graph_edges.end(), graphs[h]); + insert(&graph_edges, graph_edges.end(), m.second); } /* we are now past the point of no return. 
We can start making irreversible @@ -1641,7 +1637,7 @@ bool roseCheckRose(const RoseInGraph &ig, bool prefilter, } void RoseBuildImpl::add(bool anchored, bool eod, const ue2_literal &lit, - const ue2::flat_set &reports) { + const flat_set &reports) { assert(!reports.empty()); if (cc.grey.floodAsPuffette && !anchored && !eod && is_flood(lit) && diff --git a/src/rose/rose_build_add_internal.h b/src/rose/rose_build_add_internal.h index 569485a4..143f1dfa 100644 --- a/src/rose/rose_build_add_internal.h +++ b/src/rose/rose_build_add_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,8 +31,7 @@ #include "rose_graph.h" #include "ue2common.h" - -#include +#include "util/flat_containers.h" namespace ue2 { @@ -41,8 +40,8 @@ class RoseBuildImpl; RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, u32 minBound, u32 maxBound, u32 literalId, size_t literalLength, - const ue2::flat_set &reports); + const flat_set &reports); } // namespace ue2 -#endif +#endif // ROSE_BUILD_ADD_INTERNAL_H diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index bd8eed0c..0a7e44c3 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -144,7 +144,7 @@ void findMaskLiteral(const vector &mask, bool streaming, } static -bool initFmlCandidates(const CharReach &cr, vector *cand) { +bool initFmlCandidates(const CharReach &cr, vector &cand) { for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { char c = (char)i; bool nocase = myisupper(c) && cr.test(mytolower(c)); @@ -152,24 +152,25 @@ bool initFmlCandidates(const CharReach &cr, vector *cand) { continue; } - if (cand->size() >= MAX_MASK_LITS) { + if (cand.size() >= MAX_MASK_LITS) { DEBUG_PRINTF("hit lit limit of %u\n", MAX_MASK_LITS); return false; } - cand->emplace_back(c, nocase); + cand.emplace_back(c, nocase); } - assert(cand->size() <= MAX_MASK_LITS); - return !cand->empty(); + assert(cand.size() <= MAX_MASK_LITS); + return !cand.empty(); } static -bool expandFmlCandidates(const CharReach &cr, vector *cand) { +bool expandFmlCandidates(const CharReach &cr, vector &curr, + vector &cand) { DEBUG_PRINTF("expanding string with cr of %zu\n", cr.count()); - DEBUG_PRINTF(" current cand list size %zu\n", cand->size()); + DEBUG_PRINTF(" current cand list size %zu\n", cand.size()); - vector curr; + curr.clear(); for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { char c = (char)i; @@ -178,14 +179,14 @@ bool expandFmlCandidates(const CharReach &cr, vector *cand) { continue; } - for (const auto &lit : *cand) { + for (const auto &lit : cand) { if (curr.size() >= MAX_MASK_LITS) { DEBUG_PRINTF("hit lit limit of %u\n", MAX_MASK_LITS); return false; } - curr.emplace_back(c, nocase); - curr.back() += lit; + curr.push_back(lit); + curr.back().push_back(c, nocase); } } @@ -196,7 +197,7 @@ bool expandFmlCandidates(const CharReach &cr, vector *cand) { } assert(curr.size() <= MAX_MASK_LITS); - cand->swap(curr); + cand.swap(curr); return true; } @@ -213,6 +214,7 @@ u32 scoreFmlCandidates(const vector &cand) { u32 min_period = len; for (const auto &lit : cand) { + DEBUG_PRINTF("candidate: %s\n", dumpString(lit).c_str()); u32 period = lit.length() - maxStringSelfOverlap(lit); min_period = min(min_period, period); } @@ -238,31 +240,37 @@ bool findMaskLiterals(const 
vector &mask, vector *lit, *minBound = 0; *length = 0; - vector candidates, best_candidates; + vector candidates, best_candidates, curr_candidates; u32 best_score = 0; u32 best_minOffset = 0; - vector::const_iterator it, itb, ite; - for (it = itb = mask.begin(), ite = mask.end(); it != ite; ++it) { + + for (auto it = mask.begin(); it != mask.end(); ++it) { candidates.clear(); - if (!initFmlCandidates(*it, &candidates)) { + if (!initFmlCandidates(*it, candidates)) { DEBUG_PRINTF("failed to init\n"); continue; } DEBUG_PRINTF("++\n"); - vector::const_iterator jt = it; - while (jt != itb) { + auto jt = it; + while (jt != mask.begin()) { --jt; DEBUG_PRINTF("--\n"); - if (!expandFmlCandidates(*jt, &candidates)) { + if (!expandFmlCandidates(*jt, curr_candidates, candidates)) { DEBUG_PRINTF("expansion stopped\n"); break; } } + + // Candidates have been expanded in reverse order. + for (auto &cand : candidates) { + cand = reverse_literal(cand); + } + u32 score = scoreFmlCandidates(candidates); DEBUG_PRINTF("scored %u for literal set of size %zu\n", score, candidates.size()); if (!candidates.empty() && score >= best_score) { - best_minOffset = it - itb - candidates.back().length() + 1; + best_minOffset = it - mask.begin() - candidates.back().length() + 1; best_candidates.swap(candidates); best_score = score; } @@ -277,11 +285,12 @@ bool findMaskLiterals(const vector &mask, vector *lit, *length = best_candidates.back().length(); DEBUG_PRINTF("best minbound %u length %u\n", *minBound, *length); - for (const auto &cand : best_candidates) { - assert(cand.length() == *length); - lit->push_back(cand); - } + assert(all_of_in(best_candidates, [&](const ue2_literal &s) { + return s.length() == *length; + })); + + *lit = std::move(best_candidates); return true; } @@ -414,8 +423,8 @@ bool validateTransientMask(const vector &mask, bool anchored, static bool maskIsNeeded(const ue2_literal &lit, const NGHolder &g) { - ue2::flat_set curr = {g.accept}; - ue2::flat_set next; + flat_set curr = {g.accept}; + flat_set next; for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { const CharReach &cr = *it; @@ -451,7 +460,7 @@ bool maskIsNeeded(const ue2_literal &lit, const NGHolder &g) { static void addTransientMask(RoseBuildImpl &build, const vector &mask, - const ue2::flat_set &reports, bool anchored, + const flat_set &reports, bool anchored, bool eod) { vector lits; u32 lit_minBound; /* minBound of each literal in lit */ @@ -516,7 +525,7 @@ void addTransientMask(RoseBuildImpl &build, const vector &mask, ENSURE_AT_LEAST(&build.ematcher_region_size, mask.size()); } - const ue2::flat_set no_reports; + const flat_set no_reports; for (const auto &lit : lits) { u32 lit_id = build.getLiteralId(lit, msk, cmp, delay, table); @@ -553,7 +562,7 @@ void addTransientMask(RoseBuildImpl &build, const vector &mask, } static -unique_ptr buildMaskRhs(const ue2::flat_set &reports, +unique_ptr buildMaskRhs(const flat_set &reports, const vector &mask, u32 suffix_len) { assert(suffix_len); @@ -581,10 +590,9 @@ unique_ptr buildMaskRhs(const ue2::flat_set &reports, } static -void doAddMask(RoseBuildImpl &tbi, bool anchored, - const vector &mask, const ue2_literal &lit, - u32 prefix_len, u32 suffix_len, - const ue2::flat_set &reports) { +void doAddMask(RoseBuildImpl &tbi, bool anchored, const vector &mask, + const ue2_literal &lit, u32 prefix_len, u32 suffix_len, + const flat_set &reports) { /* Note: bounds are relative to literal start */ RoseInGraph ig; RoseInVertex s = add_vertex(RoseInVertexProps::makeStart(anchored), ig); @@ 
-711,7 +719,7 @@ bool checkAllowMask(const vector &mask, ue2_literal *lit, } bool RoseBuildImpl::add(bool anchored, const vector &mask, - const ue2::flat_set &reports) { + const flat_set &reports) { if (validateTransientMask(mask, anchored, false, cc.grey)) { bool eod = false; addTransientMask(*this, mask, reports, anchored, eod); @@ -734,14 +742,14 @@ bool RoseBuildImpl::add(bool anchored, const vector &mask, } bool RoseBuildImpl::validateMask(const vector &mask, - UNUSED const ue2::flat_set &reports, + UNUSED const flat_set &reports, bool anchored, bool eod) const { return validateTransientMask(mask, anchored, eod, cc.grey); } static unique_ptr makeAnchoredGraph(const vector &mask, - const ue2::flat_set &reports, + const flat_set &reports, bool eod) { auto gp = ue2::make_unique(); NGHolder &g = *gp; @@ -763,7 +771,7 @@ unique_ptr makeAnchoredGraph(const vector &mask, static bool addAnchoredMask(RoseBuildImpl &build, const vector &mask, - const ue2::flat_set &reports, bool eod) { + const flat_set &reports, bool eod) { if (!build.cc.grey.allowAnchoredAcyclic) { return false; } @@ -775,8 +783,8 @@ bool addAnchoredMask(RoseBuildImpl &build, const vector &mask, } void RoseBuildImpl::addMask(const vector &mask, - const ue2::flat_set &reports, - bool anchored, bool eod) { + const flat_set &reports, bool anchored, + bool eod) { if (anchored && addAnchoredMask(*this, mask, reports, eod)) { DEBUG_PRINTF("added mask as anchored acyclic graph\n"); return; diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index a2af160e..8ea07c95 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -49,11 +49,12 @@ #include "util/compile_error.h" #include "util/container.h" #include "util/determinise.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/order_check.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" +#include "util/unordered.h" #include "util/verify_types.h" #include @@ -274,7 +275,7 @@ u32 anchoredStateSize(const anchored_matcher_info &atable) { namespace { -typedef bitfield nfa_state_set; +using nfa_state_set = bitfield; struct Holder_StateSet { Holder_StateSet() : wdelay(0) {} @@ -285,19 +286,16 @@ struct Holder_StateSet { bool operator==(const Holder_StateSet &b) const { return wdelay == b.wdelay && wrap_state == b.wrap_state; } -}; -size_t hash_value(const Holder_StateSet &s) { - size_t val = 0; - boost::hash_combine(val, s.wrap_state); - boost::hash_combine(val, s.wdelay); - return val; -} + size_t hash() const { + return hash_all(wrap_state, wdelay); + } +}; class Automaton_Holder { public: - typedef Holder_StateSet StateSet; - typedef ue2::unordered_map StateMap; + using StateSet = Holder_StateSet; + using StateMap = ue2_unordered_map; explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) { for (auto v : vertices_range(g)) { @@ -416,7 +414,7 @@ public: private: const NGHolder &g; - ue2::unordered_map vertexToIndex; + unordered_map vertexToIndex; vector indexToVertex; vector cr_by_index; StateSet init; @@ -701,8 +699,8 @@ int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) { Automaton_Holder autom(h); - unique_ptr out_dfa = ue2::make_unique(NFA_OUTFIX_RAW); - if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) { + auto out_dfa = ue2::make_unique(NFA_OUTFIX_RAW); + if (determinise(autom, out_dfa->states, MAX_DFA_STATES)) { return finalise_out(build, h, autom, move(out_dfa), remap); } @@ -712,7 +710,7 @@ int 
addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) { static void setReports(NGHolder &h, const map> &reportMap, - const ue2::unordered_map &orig_to_copy) { + const unordered_map &orig_to_copy) { for (const auto &m : reportMap) { NFAVertex t = orig_to_copy.at(m.first); assert(!m.second.empty()); @@ -724,7 +722,7 @@ void setReports(NGHolder &h, const map> &reportMap, int addAnchoredNFA(RoseBuildImpl &build, const NGHolder &wrapper, const map> &reportMap) { NGHolder h; - ue2::unordered_map orig_to_copy; + unordered_map orig_to_copy; cloneHolder(h, wrapper, &orig_to_copy); clear_in_edges(h.accept, h); clear_in_edges(h.acceptEod, h); @@ -764,8 +762,8 @@ void buildSimpleDfas(const RoseBuildImpl &build, const vector &frag_map, auto h = populate_holder(simple.first, exit_ids); Automaton_Holder autom(*h); auto rdfa = ue2::make_unique(NFA_OUTFIX_RAW); - UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES); - assert(!rv); + UNUSED bool rv = determinise(autom, rdfa->states, MAX_DFA_STATES); + assert(rv); rdfa->start_anchored = INIT_STATE; rdfa->start_floating = DEAD_STATE; rdfa->alpha_size = autom.alphasize; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 4d0793bf..9a546ae4 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -41,6 +41,7 @@ #include "rose_build_long_lit.h" #include "rose_build_lookaround.h" #include "rose_build_matchers.h" +#include "rose_build_misc.h" #include "rose_build_program.h" #include "rose_build_resources.h" #include "rose_build_scatter.h" @@ -49,6 +50,7 @@ #include "rose_internal.h" #include "rose_program.h" #include "hwlm/hwlm.h" /* engine types */ +#include "hwlm/hwlm_build.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" #include "nfa/goughcompile.h" @@ -84,6 +86,7 @@ #include "util/container.h" #include "util/fatbit_build.h" #include "util/graph_range.h" +#include "util/insertion_ordered.h" #include "util/make_unique.h" #include "util/multibit_build.h" #include "util/noncopyable.h" @@ -144,8 +147,8 @@ struct build_context : noncopyable { /** \brief Simple cache of programs written to engine blob, used for * deduplication. */ - ue2::unordered_map program_cache; + unordered_map program_cache; /** \brief State indices, for those roles that have them. * Each vertex present has a unique state index in the range @@ -154,7 +157,7 @@ struct build_context : noncopyable { /** \brief Mapping from queue index to bytecode offset for built engines * that have already been pushed into the engine_blob. */ - ue2::unordered_map engineOffsets; + unordered_map engineOffsets; /** \brief List of long literals (ones with CHECK_LONG_LIT instructions) * that need hash table support. */ @@ -392,13 +395,15 @@ void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, so->activeLeafArray = curr_offset; /* TODO: limit size of array */ curr_offset += mmbit_size(activeArrayCount); + so->activeLeafArray_size = mmbit_size(activeArrayCount); so->activeLeftArray = curr_offset; /* TODO: limit size of array */ + curr_offset += mmbit_size(activeLeftCount); so->activeLeftArray_size = mmbit_size(activeLeftCount); - curr_offset += so->activeLeftArray_size; so->longLitState = curr_offset; curr_offset += longLitStreamStateRequired; + so->longLitState_size = longLitStreamStateRequired; // ONE WHOLE BYTE for each active leftfix with lag. 
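The `fillStateOffsets` changes above start recording an explicit size next to each stream-state region (`activeLeafArray_size`, `longLitState_size`, and later `exhausted_size` and `somMultibit_size`) instead of leaving those sizes to be re-derived from counts. A minimal sketch of that offset-plus-size layout pattern, with hypothetical names rather than Hyperscan's actual `RoseStateOffsets`, assuming the sizes feed a later consumer such as the 4.6.0 stream state compression pass:

```cpp
#include <cstdint>

// Illustration only: lay out packed stream-state regions while recording both
// the offset and the size of each region, so a later pass can copy or skip
// regions without recomputing their extents.
struct RegionOffsets {
    uint32_t activeLeafArray = 0, activeLeafArray_size = 0;
    uint32_t longLitState = 0, longLitState_size = 0;
    uint32_t end = 0;
};

static RegionOffsets layoutRegions(uint32_t activeLeafBytes,
                                   uint32_t longLitBytes) {
    RegionOffsets so;
    uint32_t curr = 0;

    so.activeLeafArray = curr;
    so.activeLeafArray_size = activeLeafBytes;
    curr += activeLeafBytes;

    so.longLitState = curr;
    so.longLitState_size = longLitBytes;
    curr += longLitBytes;

    so.end = curr; // total stream state consumed so far
    return so;
}
```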
so->leftfixLagTable = curr_offset; @@ -419,6 +424,7 @@ void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, // Exhaustion multibit. so->exhausted = curr_offset; curr_offset += mmbit_size(build.rm.numEkeys()); + so->exhausted_size = mmbit_size(build.rm.numEkeys()); // SOM locations and valid/writeable multibit structures. if (build.ssm.numSomSlots()) { @@ -434,6 +440,7 @@ void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, curr_offset += mmbit_size(build.ssm.numSomSlots()); so->somWritable = curr_offset; curr_offset += mmbit_size(build.ssm.numSomSlots()); + so->somMultibit_size = mmbit_size(build.ssm.numSomSlots()); } else { // No SOM handling, avoid growing the stream state any further. so->somLocation = 0; @@ -442,6 +449,7 @@ void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, } // note: state space for mask nfas is allocated later + so->nfaStateBegin = curr_offset; so->end = curr_offset; } @@ -1437,6 +1445,10 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, continue; } + if (leftfix.haig()) { + continue; + } + if (leftfix.graph() || leftfix.castle()) { leftfixes.emplace(leftfix, role_id); vertex_map[role_id].push_back(v); @@ -1467,11 +1479,11 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, RoseGraph &g = tbi.g; const CompileContext &cc = tbi.cc; - map > infixTriggers; - vector order; - unordered_map > succs; + map> infixTriggers; findInfixTriggers(tbi, &infixTriggers); + insertion_ordered_map> succs; + if (cc.grey.allowTamarama && cc.streaming && !do_prefix) { findExclusiveInfixes(tbi, bc, qif, infixTriggers, no_retrigger_queues); } @@ -1510,10 +1522,6 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, } } - if (!contains(succs, leftfix)) { - order.push_back(leftfix); - } - succs[leftfix].push_back(v); } @@ -1522,8 +1530,9 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, map eager; - for (const left_id &leftfix : order) { - const auto &left_succs = succs[leftfix]; + for (const auto &m : succs) { + const left_id &leftfix = m.first; + const auto &left_succs = m.second; rose_group squash_mask = tbi.rose_squash_masks.at(leftfix); eager_info ei; @@ -1542,9 +1551,11 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, eager.clear(); } - for (const left_id &leftfix : order) { + for (const auto &m : succs) { + const left_id &leftfix = m.first; + const auto &left_succs = m.second; buildLeftfix(tbi, bc, do_prefix, qif.get_queue(), infixTriggers, - no_retrigger_queues, eager_queues, eager, succs[leftfix], + no_retrigger_queues, eager_queues, eager, left_succs, leftfix); } @@ -1874,6 +1885,10 @@ void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, continue; } + if (s.haig()) { + continue; + } + // Currently disable eod suffixes for exclusive analysis if (!tbi.isInETable(v) && (s.graph() || s.castle())) { DEBUG_PRINTF("assigning %p to id %u\n", s.graph(), role_id); @@ -2038,7 +2053,7 @@ bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, static void allocateStateSpace(const engine_info &eng_info, NfaInfo &nfa_info, RoseStateOffsets *so, u32 *scratchStateSize, - u32 *streamStateSize, u32 *transientStateSize) { + u32 *transientStateSize) { u32 state_offset; if (eng_info.transient) { // Transient engines do not use stream state, but must have room in @@ -2049,7 +2064,6 @@ void allocateStateSpace(const engine_info &eng_info, NfaInfo &nfa_info, // Pack NFA stream state on to the end of the Rose stream state. 
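In `buildLeftfixes` above, the pair of containers (an unordered map of successors plus a separate `order` vector) is replaced by a single `insertion_ordered_map`, so iteration follows first-insertion order and bytecode generation stays deterministic. A minimal sketch of such a container, assuming hashable keys; this is an illustration, not ue2's `util/insertion_ordered.h`:

```cpp
#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

// Iteration visits entries in the order their keys were first inserted.
template <typename K, typename V>
class InsertionOrderedMap {
    std::unordered_map<K, std::size_t> index_; // key -> position in data_
    std::vector<std::pair<K, V>> data_;        // entries in insertion order

public:
    V &operator[](const K &key) {
        auto it = index_.find(key);
        if (it == index_.end()) {
            index_.emplace(key, data_.size());
            data_.emplace_back(key, V{});
            return data_.back().second;
        }
        return data_[it->second].second;
    }

    auto begin() { return data_.begin(); }
    auto end() { return data_.end(); }
};
```

With this shape, the previous idiom of pushing each new key onto a side vector and later iterating that vector collapses into a single `for (const auto &m : succs)` loop, as in the hunks above.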
state_offset = so->end; so->end += eng_info.stream_size; - *streamStateSize += eng_info.stream_size; } nfa_info.stateOffset = state_offset; @@ -2063,12 +2077,11 @@ void allocateStateSpace(const engine_info &eng_info, NfaInfo &nfa_info, static void updateNfaState(const build_context &bc, vector &nfa_infos, RoseStateOffsets *so, u32 *scratchStateSize, - u32 *streamStateSize, u32 *transientStateSize) { + u32 *transientStateSize) { if (nfa_infos.empty()) { return; } - *streamStateSize = 0; *transientStateSize = 0; *scratchStateSize = 0; @@ -2076,7 +2089,7 @@ void updateNfaState(const build_context &bc, vector &nfa_infos, NfaInfo &nfa_info = nfa_infos[qi]; const auto &eng_info = bc.engine_info_by_queue.at(qi); allocateStateSpace(eng_info, nfa_info, so, scratchStateSize, - streamStateSize, transientStateSize); + transientStateSize); } } @@ -2268,9 +2281,9 @@ bool hasMpvTrigger(const set &reports, const ReportManager &rm) { } static -bool anyEndfixMpvTriggers(const RoseBuildImpl &tbi) { - const RoseGraph &g = tbi.g; - ue2::unordered_set done; +bool anyEndfixMpvTriggers(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + unordered_set done; /* suffixes */ for (auto v : vertices_range(g)) { @@ -2282,14 +2295,14 @@ bool anyEndfixMpvTriggers(const RoseBuildImpl &tbi) { } done.insert(g[v].suffix); - if (hasMpvTrigger(all_reports(g[v].suffix), tbi.rm)) { + if (hasMpvTrigger(all_reports(g[v].suffix), build.rm)) { return true; } } /* outfixes */ - for (const auto &out : tbi.outfixes) { - if (hasMpvTrigger(all_reports(out), tbi.rm)) { + for (const auto &out : build.outfixes) { + if (hasMpvTrigger(all_reports(out), build.rm)) { return true; } } @@ -2328,6 +2341,7 @@ void addSomRevNfas(build_context &bc, RoseEngine &proto, static void recordResources(RoseResources &resources, const RoseBuildImpl &build, + const vector &anchored_dfas, const vector &fragments) { if (!build.outfixes.empty()) { resources.has_outfixes = true; @@ -2346,6 +2360,15 @@ void recordResources(RoseResources &resources, const RoseBuildImpl &build, break; } } + + resources.has_anchored = !anchored_dfas.empty(); + resources.has_anchored_multiple = anchored_dfas.size() > 1; + for (const auto &rdfa : anchored_dfas) { + if (rdfa.states.size() > 256) { + resources.has_anchored_large = true; + } + } + } static @@ -2490,7 +2513,7 @@ void writeNfaInfo(const RoseBuildImpl &build, build_context &bc, // Update state offsets to do with NFAs in proto and in the NfaInfo // structures. 
updateNfaState(bc, infos, &proto.stateOffsets, &proto.scratchStateSize, - &proto.nfaStateSize, &proto.tStateSize); + &proto.tStateSize); proto.nfaInfoOffset = bc.engine_blob.add_range(infos); } @@ -2587,7 +2610,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, const RoseGraph &g = tbi.g; const CompileContext &cc = tbi.cc; - ue2::unordered_set done_core; + unordered_set done_core; leftTable.resize(leftfixCount); @@ -2672,30 +2695,22 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, static RoseProgram makeLiteralProgram(const RoseBuildImpl &build, build_context &bc, ProgramBuild &prog_build, u32 lit_id, - const map> &lit_edge_map, + const vector> &lit_edge_map, bool is_anchored_replay_program) { - const vector no_edges; - DEBUG_PRINTF("lit_id=%u\n", lit_id); - const vector *edges_ptr; - if (contains(lit_edge_map, lit_id)) { - edges_ptr = &lit_edge_map.at(lit_id); - } else { - /* literal may happen only in a delay context */ - edges_ptr = &no_edges; - } + assert(lit_id < lit_edge_map.size()); return makeLiteralProgram(build, bc.leftfix_info, bc.suffixes, - bc.engine_info_by_queue, - bc.roleStateIndices, prog_build, lit_id, - *edges_ptr, is_anchored_replay_program); + bc.engine_info_by_queue, bc.roleStateIndices, + prog_build, lit_id, lit_edge_map.at(lit_id), + is_anchored_replay_program); } static RoseProgram makeFragmentProgram(const RoseBuildImpl &build, build_context &bc, ProgramBuild &prog_build, const vector &lit_ids, - const map> &lit_edge_map) { + const vector> &lit_edge_map) { assert(!lit_ids.empty()); vector blocks; @@ -2713,28 +2728,27 @@ RoseProgram makeFragmentProgram(const RoseBuildImpl &build, build_context &bc, * vertices with that literal ID. */ static -map> findEdgesByLiteral(const RoseBuildImpl &build) { - // Use a set of edges while building the map to cull duplicates. - map> unique_lit_edge_map; +vector> findEdgesByLiteral(const RoseBuildImpl &build) { + vector> lit_edge_map(build.literals.size()); const auto &g = build.g; - for (const auto &e : edges_range(g)) { - const auto &v = target(e, g); + for (const auto &v : vertices_range(g)) { for (const auto &lit_id : g[v].literals) { - unique_lit_edge_map[lit_id].insert(e); + assert(lit_id < lit_edge_map.size()); + auto &edge_list = lit_edge_map.at(lit_id); + insert(&edge_list, edge_list.end(), in_edges(v, g)); } } - // Build output map, sorting edges by (source, target) vertex index. - map> lit_edge_map; - for (const auto &m : unique_lit_edge_map) { - auto edge_list = vector(begin(m.second), end(m.second)); - sort(begin(edge_list), end(edge_list), - [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].index, g[target(a, g)].index) < - tie(g[source(b, g)].index, g[target(b, g)].index); - }); - lit_edge_map.emplace(m.first, std::move(edge_list)); + // Sort edges in each edge list by (source, target) indices. This gives us + // less surprising ordering in program generation for a literal with many + // edges. 
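`findEdgesByLiteral` now returns a vector indexed by literal ID rather than a map keyed by ID: literal IDs are dense, so every lookup is O(1) and a literal that only occurs in a delay context simply owns an empty edge list. The general pattern, with simplified stand-in types for the Rose graph:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// (source index, target index) stands in for a RoseEdge here.
using Edge = std::pair<uint32_t, uint32_t>;

std::vector<std::vector<Edge>>
groupEdgesByLiteral(uint32_t num_literals,
                    const std::vector<std::pair<uint32_t, Edge>> &lit_edges) {
    std::vector<std::vector<Edge>> by_lit(num_literals);
    for (const auto &m : lit_edges) {
        by_lit.at(m.first).push_back(m.second);
    }
    // Sort each list by (source, target) so program generation sees a stable,
    // unsurprising order for literals with many edges.
    for (auto &edges : by_lit) {
        std::sort(edges.begin(), edges.end());
    }
    return by_lit;
}
```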
+ for (auto &edge_list : lit_edge_map) { + sort(begin(edge_list), end(edge_list), [&g](const RoseEdge &a, + const RoseEdge &b) { + return tie(g[source(a, g)].index, g[target(a, g)].index) < + tie(g[source(b, g)].index, g[target(b, g)].index); + }); } return lit_edge_map; @@ -2761,17 +2775,13 @@ bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) { } static -rose_literal_id getFragment(const rose_literal_id &lit) { - if (lit.s.length() <= ROSE_SHORT_LITERAL_LEN_MAX) { - DEBUG_PRINTF("whole lit is frag\n"); - return lit; +rose_literal_id getFragment(rose_literal_id lit) { + if (lit.s.length() > ROSE_SHORT_LITERAL_LEN_MAX) { + // Trim to last ROSE_SHORT_LITERAL_LEN_MAX bytes. + lit.s.erase(0, lit.s.length() - ROSE_SHORT_LITERAL_LEN_MAX); } - - rose_literal_id frag = lit; - frag.s = frag.s.substr(frag.s.length() - ROSE_SHORT_LITERAL_LEN_MAX); - - DEBUG_PRINTF("fragment: %s\n", dumpString(frag.s).c_str()); - return frag; + DEBUG_PRINTF("fragment: %s\n", dumpString(lit.s).c_str()); + return lit; } static @@ -2803,7 +2813,7 @@ vector groupByFragment(const RoseBuildImpl &build) { auto groups = info.group_mask; if (lit.s.length() < ROSE_SHORT_LITERAL_LEN_MAX) { - fragments.emplace_back(frag_id, groups, lit_id); + fragments.emplace_back(frag_id, lit.s, groups, lit_id); frag_id++; continue; } @@ -2816,10 +2826,11 @@ vector groupByFragment(const RoseBuildImpl &build) { } for (auto &m : frag_info) { + auto &lit = m.first; auto &fi = m.second; DEBUG_PRINTF("frag %s -> ids: %s\n", dumpString(m.first.s).c_str(), as_string_list(fi.lit_ids).c_str()); - fragments.emplace_back(frag_id, fi.groups, move(fi.lit_ids)); + fragments.emplace_back(frag_id, lit.s, fi.groups, move(fi.lit_ids)); frag_id++; assert(frag_id == fragments.size()); } @@ -2827,33 +2838,196 @@ vector groupByFragment(const RoseBuildImpl &build) { return fragments; } +static +void buildIncludedIdMap(unordered_map> &includedIdMap, + const LitProto *litProto) { + if (!litProto) { + return; + } + const auto &proto = *litProto->hwlmProto; + for (const auto &lit : proto.lits) { + if (contains(includedIdMap, lit.id)) { + const auto &included_id = includedIdMap[lit.id].first; + const auto &squash = includedIdMap[lit.id].second; + // The squash behavior should be the same for the same literal + // in different literal matchers. 
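`getFragment` above now trims an over-long literal in place to its last `ROSE_SHORT_LITERAL_LEN_MAX` bytes instead of building a new literal with `substr`; only a bounded-length suffix needs to reach the literal matcher, with the rest of the literal confirmed separately. The same trim expressed on `std::string` (the real code operates on `ue2_literal`):

```cpp
#include <cstddef>
#include <string>

// Keep only the last max_len bytes of a literal; shorter literals pass
// through unchanged.
std::string fragmentOf(std::string lit, std::size_t max_len) {
    if (lit.length() > max_len) {
        lit.erase(0, lit.length() - max_len);
    }
    return lit;
}
```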
+ if (lit.included_id != included_id || + lit.squash != squash) { + includedIdMap[lit.id] = make_pair(INVALID_LIT_ID, 0); + DEBUG_PRINTF("find different included info for the" + " same literal\n"); + } + } else if (lit.included_id != INVALID_LIT_ID) { + includedIdMap[lit.id] = make_pair(lit.included_id, lit.squash); + } else { + includedIdMap[lit.id] = make_pair(INVALID_LIT_ID, 0); + } + } +} + +static +void findInclusionGroups(vector &fragments, + LitProto *fproto, LitProto *drproto, + LitProto *eproto, LitProto *sbproto) { + unordered_map> includedIdMap; + unordered_map> includedDelayIdMap; + buildIncludedIdMap(includedIdMap, fproto); + buildIncludedIdMap(includedDelayIdMap, drproto); + buildIncludedIdMap(includedIdMap, eproto); + buildIncludedIdMap(includedIdMap, sbproto); + + size_t fragNum = fragments.size(); + vector candidates; + for (size_t j = 0; j < fragNum; j++) { + DEBUG_PRINTF("frag id %lu\n", j); + u32 id = j; + if (contains(includedIdMap, id) || + contains(includedDelayIdMap, id)) { + candidates.push_back(j); + DEBUG_PRINTF("find candidate\n"); + } + } + + for (const auto &c : candidates) { + auto &frag = fragments[c]; + u32 id = c; + if (contains(includedIdMap, id) && + includedIdMap[id].first != INVALID_LIT_ID) { + const auto &childId = includedIdMap[id]; + frag.included_frag_id = childId.first; + frag.squash = childId.second; + DEBUG_PRINTF("frag id %u child frag id %u\n", c, + frag.included_frag_id); + } + + if (contains(includedDelayIdMap, id) && + includedDelayIdMap[id].first != INVALID_LIT_ID) { + const auto &childId = includedDelayIdMap[id]; + frag.included_delay_frag_id = childId.first; + frag.delay_squash = childId.second; + + DEBUG_PRINTF("delay frag id %u child frag id %u\n", c, + frag.included_delay_frag_id); + } + } +} + +static +void buildFragmentPrograms(const RoseBuildImpl &build, + vector &fragments, + build_context &bc, ProgramBuild &prog_build, + const vector> &lit_edge_map) { + // Sort fragments based on literal length and case info to build + // included literal programs before their parent programs. + vector ordered_fragments(fragments); + stable_sort(begin(ordered_fragments), end(ordered_fragments), + [](const LitFragment &a, const LitFragment &b) { + auto len1 = a.s.length(); + auto caseful1 = !a.s.any_nocase(); + auto len2 = b.s.length(); + auto caseful2 = !b.s.any_nocase(); + return tie(len1, caseful1) < tie(len2, caseful2); + }); + + for (auto &frag : ordered_fragments) { + auto &pfrag = fragments[frag.fragment_id]; + DEBUG_PRINTF("frag_id=%u, lit_ids=[%s]\n", pfrag.fragment_id, + as_string_list(pfrag.lit_ids).c_str()); + + auto lit_prog = makeFragmentProgram(build, bc, prog_build, + pfrag.lit_ids, lit_edge_map); + if (pfrag.included_frag_id != INVALID_FRAG_ID && + !lit_prog.empty()) { + auto &cfrag = fragments[pfrag.included_frag_id]; + assert(pfrag.s.length() >= cfrag.s.length() && + !pfrag.s.any_nocase() >= !cfrag.s.any_nocase()); + u32 child_offset = cfrag.lit_program_offset; + DEBUG_PRINTF("child %u offset %u\n", cfrag.fragment_id, + child_offset); + addIncludedJumpProgram(lit_prog, child_offset, pfrag.squash); + } + pfrag.lit_program_offset = writeProgram(bc, move(lit_prog)); + + // We only do delayed rebuild in streaming mode. 
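`buildFragmentPrograms` sorts a copy of the fragment list so that included (child) fragments, which are shorter and caseless before caseful, are processed first; a parent fragment can then chain to its child's already-written program offset via an included-jump. A standalone sketch of that ordering, with simplified types in place of `LitFragment`:

```cpp
#include <algorithm>
#include <string>
#include <tuple>
#include <vector>

struct Frag {
    std::string s;
    bool nocase = false;
    unsigned program_offset = 0;
};

// Children first: shorter literals before longer ones, and for equal length,
// caseless before caseful, so a parent's assertion (at least as long, at
// least as caseful as its child) holds when it looks up the child offset.
void sortChildrenFirst(std::vector<Frag> &frags) {
    std::stable_sort(frags.begin(), frags.end(),
                     [](const Frag &a, const Frag &b) {
                         auto len_a = a.s.length();
                         auto len_b = b.s.length();
                         bool caseful_a = !a.nocase;
                         bool caseful_b = !b.nocase;
                         return std::tie(len_a, caseful_a) <
                                std::tie(len_b, caseful_b);
                     });
}
```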
+ if (!build.cc.streaming) { + continue; + } + + auto rebuild_prog = makeDelayRebuildProgram(build, prog_build, + pfrag.lit_ids); + if (pfrag.included_delay_frag_id != INVALID_FRAG_ID && + !rebuild_prog.empty()) { + auto &cfrag = fragments[pfrag.included_delay_frag_id]; + assert(pfrag.s.length() >= cfrag.s.length() && + !pfrag.s.any_nocase() >= !cfrag.s.any_nocase()); + u32 child_offset = cfrag.delay_program_offset; + DEBUG_PRINTF("child %u offset %u\n", cfrag.fragment_id, + child_offset); + addIncludedJumpProgram(rebuild_prog, child_offset, + pfrag.delay_squash); + } + pfrag.delay_program_offset = writeProgram(bc, move(rebuild_prog)); + } +} + +static +void updateLitProtoProgramOffset(vector &fragments, + LitProto &litProto, bool delay) { + auto &proto = *litProto.hwlmProto; + for (auto &lit : proto.lits) { + auto fragId = lit.id; + auto &frag = fragments[fragId]; + if (delay) { + DEBUG_PRINTF("delay_program_offset:%u\n", + frag.delay_program_offset); + lit.id = frag.delay_program_offset; + } else { + DEBUG_PRINTF("lit_program_offset:%u\n", + frag.lit_program_offset); + lit.id = frag.lit_program_offset; + } + } +} + +static +void updateLitProgramOffset(vector &fragments, + LitProto *fproto, LitProto *drproto, + LitProto *eproto, LitProto *sbproto) { + if (fproto) { + updateLitProtoProgramOffset(fragments, *fproto, false); + } + + if (drproto) { + updateLitProtoProgramOffset(fragments, *drproto, true); + } + + if (eproto) { + updateLitProtoProgramOffset(fragments, *eproto, false); + } + + if (sbproto) { + updateLitProtoProgramOffset(fragments, *sbproto, false); + } +} + /** * \brief Build the interpreter programs for each literal. */ static void buildLiteralPrograms(const RoseBuildImpl &build, vector &fragments, build_context &bc, - ProgramBuild &prog_build) { + ProgramBuild &prog_build, LitProto *fproto, + LitProto *drproto, LitProto *eproto, + LitProto *sbproto) { DEBUG_PRINTF("%zu fragments\n", fragments.size()); auto lit_edge_map = findEdgesByLiteral(build); - for (auto &frag : fragments) { - DEBUG_PRINTF("frag_id=%u, lit_ids=[%s]\n", frag.fragment_id, - as_string_list(frag.lit_ids).c_str()); + findInclusionGroups(fragments, fproto, drproto, eproto, sbproto); - auto lit_prog = makeFragmentProgram(build, bc, prog_build, frag.lit_ids, - lit_edge_map); - frag.lit_program_offset = writeProgram(bc, move(lit_prog)); + buildFragmentPrograms(build, fragments, bc, prog_build, lit_edge_map); - // We only do delayed rebuild in streaming mode. 
- if (!build.cc.streaming) { - continue; - } - - auto rebuild_prog = makeDelayRebuildProgram(build, prog_build, - frag.lit_ids); - frag.delay_program_offset = writeProgram(bc, move(rebuild_prog)); - } + // update literal program offsets for literal matcher prototypes + updateLitProgramOffset(fragments, fproto, drproto, eproto, sbproto); } /** @@ -3259,6 +3433,7 @@ u32 writeEagerQueueIter(const set &eager, u32 leftfixBeginQueue, static bytecode_ptr addSmallWriteEngine(const RoseBuildImpl &build, + const RoseResources &res, bytecode_ptr rose) { assert(rose); @@ -3267,7 +3442,7 @@ bytecode_ptr addSmallWriteEngine(const RoseBuildImpl &build, return rose; } - u32 qual = roseQuality(rose.get()); + u32 qual = roseQuality(res, rose.get()); auto smwr_engine = build.smwr.build(qual); if (!smwr_engine) { DEBUG_PRINTF("no smwr built\n"); @@ -3407,10 +3582,7 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { build_context bc; u32 floatingMinLiteralMatchOffset = findMinFloatingLiteralMatch(*this, anchored_dfas); - recordResources(bc.resources, *this, fragments); - if (!anchored_dfas.empty()) { - bc.resources.has_anchored = true; - } + recordResources(bc.resources, *this, anchored_dfas, fragments); bc.needs_mpv_catchup = needsMpvCatchup(*this); makeBoundaryPrograms(*this, bc, boundary, dboundary, proto.boundary); @@ -3470,7 +3642,24 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { tie(proto.delayProgramOffset, proto.delay_count) = writeDelayPrograms(*this, fragments, bc, prog_build); - buildLiteralPrograms(*this, fragments, bc, prog_build); + // Build floating HWLM matcher prototype. + rose_group fgroups = 0; + auto fproto = buildFloatingMatcherProto(*this, fragments, + longLitLengthThreshold, + &fgroups, &historyRequired); + + // Build delay rebuild HWLM matcher prototype. + auto drproto = buildDelayRebuildMatcherProto(*this, fragments, + longLitLengthThreshold); + + // Build EOD-anchored HWLM matcher prototype. + auto eproto = buildEodAnchoredMatcherProto(*this, fragments); + + // Build small-block HWLM matcher prototype. + auto sbproto = buildSmallBlockMatcherProto(*this, fragments); + + buildLiteralPrograms(*this, fragments, bc, prog_build, fproto.get(), + drproto.get(), eproto.get(), sbproto.get()); auto eod_prog = makeEodProgram(*this, bc, prog_build, eodNfaIterOffset); proto.eodProgramOffset = writeProgram(bc, move(eod_prog)); @@ -3497,29 +3686,26 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { } // Build floating HWLM matcher. - rose_group fgroups = 0; - auto ftable = buildFloatingMatcher(*this, fragments, longLitLengthThreshold, - &fgroups, &historyRequired); + auto ftable = buildHWLMMatcher(*this, fproto.get()); if (ftable) { proto.fmatcherOffset = bc.engine_blob.add(ftable); bc.resources.has_floating = true; } // Build delay rebuild HWLM matcher. - auto drtable = buildDelayRebuildMatcher(*this, fragments, - longLitLengthThreshold); + auto drtable = buildHWLMMatcher(*this, drproto.get()); if (drtable) { proto.drmatcherOffset = bc.engine_blob.add(drtable); } // Build EOD-anchored HWLM matcher. - auto etable = buildEodAnchoredMatcher(*this, fragments); + auto etable = buildHWLMMatcher(*this, eproto.get()); if (etable) { proto.ematcherOffset = bc.engine_blob.add(etable); } // Build small-block HWLM matcher. 
- auto sbtable = buildSmallBlockMatcher(*this, fragments); + auto sbtable = buildHWLMMatcher(*this, sbproto.get()); if (sbtable) { proto.sbmatcherOffset = bc.engine_blob.add(sbtable); } @@ -3618,7 +3804,6 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { proto.totalNumLiterals = verify_u32(literal_info.size()); proto.asize = verify_u32(atable.size()); proto.ematcherRegionSize = ematcher_region_size; - proto.longLitStreamState = verify_u32(longLitStreamStateRequired); proto.size = currOffset; @@ -3636,7 +3821,7 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { bc.engine_blob.write_bytes(engine.get()); // Add a small write engine if appropriate. - engine = addSmallWriteEngine(*this, move(engine)); + engine = addSmallWriteEngine(*this, bc.resources, move(engine)); DEBUG_PRINTF("rose done %p\n", engine.get()); diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp index a85a784f..59bab3b1 100644 --- a/src/rose/rose_build_castle.cpp +++ b/src/rose/rose_build_castle.cpp @@ -38,7 +38,6 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" #include @@ -55,7 +54,7 @@ namespace ue2 { static void makeCastle(LeftEngInfo &left, - unordered_map> &cache) { + unordered_map> &cache) { if (left.dfa || left.haig || left.castle) { return; } @@ -85,7 +84,7 @@ void makeCastle(LeftEngInfo &left, static void makeCastleSuffix(RoseBuildImpl &tbi, RoseVertex v, - ue2::unordered_map > &cache) { + unordered_map> &cache) { RoseSuffixInfo &suffix = tbi.g[v].suffix; if (!suffix.graph) { return; @@ -298,8 +297,8 @@ bool unmakeCastles(RoseBuildImpl &tbi) { } void remapCastleTops(RoseBuildImpl &tbi) { - ue2::unordered_map > rose_castles; - ue2::unordered_map > suffix_castles; + unordered_map> rose_castles; + unordered_map> suffix_castles; RoseGraph &g = tbi.g; for (auto v : vertices_range(g)) { diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 96241e39..1cf3bbe6 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -61,10 +61,10 @@ #include "util/compile_context.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/order_check.h" #include "util/report_manager.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" #include "util/verify_types.h" @@ -1087,13 +1087,13 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &build, const left_id &left, assert(left.graph()); const NGHolder &h = *left.graph(); - ue2::flat_set all_states; + flat_set all_states; insert(&all_states, vertices(h)); assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */ DEBUG_PRINTF("removing sds\n"); all_states.erase(h.startDs); - ue2::flat_set states; + flat_set states; /* check each pred literal to see if they all kill previous graph * state */ @@ -1639,7 +1639,7 @@ static bool danglingVertexRef(RoseBuildImpl &tbi) { RoseGraph::vertex_iterator vi, ve; tie(vi, ve) = vertices(tbi.g); - const ue2::unordered_set valid_vertices(vi, ve); + const unordered_set valid_vertices(vi, ve); if (!contains(valid_vertices, tbi.anchored_root)) { DEBUG_PRINTF("anchored root vertex %zu not in graph\n", diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index 0c1f4338..33351099 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -58,8 +58,9 @@ #include #include #include -#include +#include 
#include +#include #include @@ -84,7 +85,7 @@ size_t suffixFloodLen(const ue2_literal &s) { const ue2_literal::elem &c = s.back(); auto it = find_if(s.rbegin(), s.rend(), - bind2nd(not_equal_to(), c)); + [&c](const ue2_literal::elem &e) { return e != c; }); return distance(s.rbegin(), it); } @@ -561,7 +562,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, DEBUG_PRINTF("woot?\n"); shared_ptr h_new = make_shared(); - ue2::unordered_map rhs_map; + unordered_map rhs_map; vector exits_vec; insert(&exits_vec, exits_vec.end(), exits); splitRHS(h, exits_vec, h_new.get(), &rhs_map); diff --git a/src/rose/rose_build_dedupe.cpp b/src/rose/rose_build_dedupe.cpp index d3e72313..04144f56 100644 --- a/src/rose/rose_build_dedupe.cpp +++ b/src/rose/rose_build_dedupe.cpp @@ -39,7 +39,7 @@ using namespace std; namespace ue2 { static -bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, +bool requiresDedupe(const NGHolder &h, const flat_set &reports, const Grey &grey) { /* TODO: tighten */ NFAVertex seen_vert = NGHolder::null_vertex(); @@ -83,10 +83,10 @@ class RoseDedupeAuxImpl : public RoseDedupeAux { public: explicit RoseDedupeAuxImpl(const RoseBuildImpl &build_in); bool requiresDedupeSupport( - const ue2::flat_set &reports) const override; + const flat_set &reports) const override; private: - bool hasSafeMultiReports(const ue2::flat_set &reports) const; + bool hasSafeMultiReports(const flat_set &reports) const; const RoseBuildImpl &build; map> vert_map; //!< ordinary literals diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index b527db6c..b70112f2 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -48,6 +48,7 @@ #include "util/compile_context.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/dump_util.h" #include "util/graph_range.h" #include "util/multibit.h" #include "util/multibit_build.h" @@ -681,10 +682,17 @@ vector sparseIterValues(const mmbit_sparse_iter *it, u32 num_bits) { return keys; } - vector bits(mmbit_size(num_bits), u8{0xff}); // All bits on. - vector state(MAX_SPARSE_ITER_STATES); - + // Populate a multibit structure with all-ones. Note that the multibit + // runtime assumes that it is always safe to read 8 bytes, so we must + // over-allocate for smaller sizes. + const size_t num_bytes = mmbit_size(num_bits); + vector bits(max(size_t{8}, num_bytes), u8{0xff}); // All bits on. 
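The `suffixFloodLen` hunk in rose_build_convert.cpp above replaces `std::bind2nd`/`std::not_equal_to` (deprecated in C++11 and removed in C++17) with a lambda. The equivalent computation on a plain `std::string`, as a standalone illustration:

```cpp
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <string>

// Length of the run of identical characters at the end of a string, written
// with a lambda instead of the pre-C++17 bind2nd(not_equal_to(), c) adaptor.
std::size_t suffixRunLength(const std::string &s) {
    if (s.empty()) {
        return 0;
    }
    const char c = s.back();
    auto it = std::find_if(s.rbegin(), s.rend(),
                           [c](char e) { return e != c; });
    return static_cast<std::size_t>(std::distance(s.rbegin(), it));
}
```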
const u8 *b = bits.data(); + if (num_bytes < 8) { + b += 8 - num_bytes; + } + + vector state(MAX_SPARSE_ITER_STATES); mmbit_sparse_state *s = state.data(); u32 idx = 0; @@ -1455,6 +1463,12 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(INCLUDED_JUMP) { + os << " child_offset " << ri->child_offset << endl; + os << " squash " << (u32)ri->squash << endl; + } + PROGRAM_NEXT_INSTRUCTION + default: os << " UNKNOWN (code " << int{code} << ")" << endl; os << " " << endl; @@ -1672,13 +1686,12 @@ void dumpComponentInfo(const RoseEngine *t, const string &base) { } } - static void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { - FILE *f = fopen((base +"rose_components.csv").c_str(), "w"); + StdioFile f(base + "/rose_components.csv", "w"); - fprintf(f, "Index, Offset,Engine Type,States,Stream State,Bytecode Size," - "Kind,Notes\n"); + fprintf(f, "Index, Offset,Engine Type,States,Stream State," + "Bytecode Size,Kind,Notes\n"); for (u32 i = 0; i < t->queueCount; i++) { const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); @@ -1738,14 +1751,11 @@ void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { n->nPositions, n->streamStateSize, n->length, to_string(kind).c_str(), notes.str().c_str()); } - fclose(f); } static void dumpExhaust(const RoseEngine *t, const string &base) { - stringstream sstxt; - sstxt << base << "rose_exhaust.txt"; - FILE *f = fopen(sstxt.str().c_str(), "w"); + StdioFile f(base + "/rose_exhaust.csv", "w"); const NfaInfo *infos = (const NfaInfo *)((const char *)t + t->nfaInfoOffset); @@ -1771,8 +1781,6 @@ void dumpExhaust(const RoseEngine *t, const string &base) { fprintf(f, "\n"); } - - fclose(f); } static @@ -1790,9 +1798,8 @@ void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) { if (dump_raw) { stringstream ssraw; ssraw << base << "rose_nfa_" << i << ".raw"; - FILE *f = fopen(ssraw.str().c_str(), "w"); + StdioFile f(ssraw.str(), "w"); fwrite(n, 1, n->length, f); - fclose(f); } } } @@ -1840,9 +1847,8 @@ void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) { if (dump_raw) { stringstream ssraw; ssraw << base << "som_rev_nfa_" << i << ".raw"; - FILE *f = fopen(ssraw.str().c_str(), "w"); + StdioFile f(ssraw.str(), "w"); fwrite(n, 1, n->length, f); - fclose(f); } } } @@ -2020,15 +2026,17 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, "state space required : %u bytes\n", t->stateOffsets.end); fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); - fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8); + fprintf(f, " - exhaustion vector : %u bytes\n", + t->stateOffsets.exhausted_size); fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); fprintf(f, " - long lit matcher : %u bytes\n", t->longLitStreamState); fprintf(f, " - active array : %u bytes\n", - mmbit_size(t->activeArrayCount)); + t->stateOffsets.activeLeafArray_size); fprintf(f, " - active rose : %u bytes\n", - mmbit_size(t->activeLeftCount)); + t->stateOffsets.activeLeftArray_size); fprintf(f, " - anchored state : %u bytes\n", t->anchorStateSize); - fprintf(f, " - nfa state : %u bytes\n", t->nfaStateSize); + fprintf(f, " - nfa state : %u bytes\n", + t->stateOffsets.end - t->stateOffsets.nfaStateBegin); fprintf(f, " - (trans. 
nfa state): %u bytes\n", t->tStateSize); fprintf(f, " - one whole bytes : %u bytes\n", t->stateOffsets.anchorState - t->stateOffsets.leftfixLagTable); @@ -2060,26 +2068,6 @@ void roseDumpText(const RoseEngine *t, FILE *f) { dumpAnchoredStats(atable, f); } - if (ftable) { - fprintf(f, "\nFloating literal matcher stats:\n\n"); - hwlmPrintStats(ftable, f); - } - - if (drtable) { - fprintf(f, "\nDelay Rebuild literal matcher stats:\n\n"); - hwlmPrintStats(drtable, f); - } - - if (etable) { - fprintf(f, "\nEOD-anchored literal matcher stats:\n\n"); - hwlmPrintStats(etable, f); - } - - if (sbtable) { - fprintf(f, "\nSmall-block literal matcher stats:\n\n"); - hwlmPrintStats(sbtable, f); - } - dumpLongLiteralTable(t, f); } @@ -2112,7 +2100,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, rolesWithStateCount); DUMP_U32(t, stateSize); DUMP_U32(t, anchorStateSize); - DUMP_U32(t, nfaStateSize); DUMP_U32(t, tStateSize); DUMP_U32(t, smallWriteOffset); DUMP_U32(t, amatcherOffset); @@ -2162,7 +2149,9 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, delayRebuildLength); DUMP_U32(t, stateOffsets.history); DUMP_U32(t, stateOffsets.exhausted); + DUMP_U32(t, stateOffsets.exhausted_size); DUMP_U32(t, stateOffsets.activeLeafArray); + DUMP_U32(t, stateOffsets.activeLeafArray_size); DUMP_U32(t, stateOffsets.activeLeftArray); DUMP_U32(t, stateOffsets.activeLeftArray_size); DUMP_U32(t, stateOffsets.leftfixLagTable); @@ -2170,9 +2159,12 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, stateOffsets.groups); DUMP_U32(t, stateOffsets.groups_size); DUMP_U32(t, stateOffsets.longLitState); + DUMP_U32(t, stateOffsets.longLitState_size); DUMP_U32(t, stateOffsets.somLocation); DUMP_U32(t, stateOffsets.somValid); DUMP_U32(t, stateOffsets.somWritable); + DUMP_U32(t, stateOffsets.somMultibit_size); + DUMP_U32(t, stateOffsets.nfaStateBegin); DUMP_U32(t, stateOffsets.end); DUMP_U32(t, boundary.reportEodOffset); DUMP_U32(t, boundary.reportZeroOffset); @@ -2188,7 +2180,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, ematcherRegionSize); DUMP_U32(t, somRevCount); DUMP_U32(t, somRevOffsetOffset); - DUMP_U32(t, longLitStreamState); fprintf(f, "}\n"); fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine)); } @@ -2214,6 +2205,25 @@ void roseDumpPrograms(const vector &fragments, const RoseEngine *t, dumpRoseDelayPrograms(t, base + "/rose_delay_programs.txt"); } +static +void roseDumpLiteralMatchers(const RoseEngine *t, const string &base) { + if (const HWLM *hwlm = getFloatingMatcher(t)) { + hwlmGenerateDumpFiles(hwlm, base + "/lit_table_floating"); + } + + if (const HWLM *hwlm = getDelayRebuildMatcher(t)) { + hwlmGenerateDumpFiles(hwlm, base + "/lit_table_delay_rebuild"); + } + + if (const HWLM *hwlm = getEodMatcher(t)) { + hwlmGenerateDumpFiles(hwlm, base + "/lit_table_eod"); + } + + if (const HWLM *hwlm = getSmallBlockMatcher(t)) { + hwlmGenerateDumpFiles(hwlm, base + "/lit_table_small_block"); + } +} + void dumpRose(const RoseBuildImpl &build, const vector &fragments, const map &leftfix_queue_map, const map &suffix_queue_map, @@ -2224,24 +2234,19 @@ void dumpRose(const RoseBuildImpl &build, const vector &fragments, return; } - stringstream ss; - ss << grey.dumpPath << "rose.txt"; - - FILE *f = fopen(ss.str().c_str(), "w"); + StdioFile f(grey.dumpPath + "/rose.txt", "w"); if (!t) { fprintf(f, "<< no rose >>\n"); - fclose(f); return; } // Dump Rose table info roseDumpText(t, f); - fclose(f); - roseDumpComponents(t, false, grey.dumpPath); 
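The dump code above swaps explicit `fopen`/`fclose` pairs for a `StdioFile` object from util/dump_util.h, which closes the handle automatically and can be passed wherever a `FILE *` is expected (note the move-assignment to `f` for rose_struct.txt further below). A minimal sketch of a wrapper with that shape, assumed rather than quoted from the real header:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

class ScopedFile {
    FILE *f = nullptr;

public:
    ScopedFile(const std::string &path, const char *mode)
        : f(fopen(path.c_str(), mode)) {
        if (!f) {
            throw std::runtime_error("failed to open " + path);
        }
    }
    ScopedFile(ScopedFile &&other) noexcept : f(other.f) { other.f = nullptr; }
    ScopedFile &operator=(ScopedFile &&other) noexcept {
        if (this != &other) {
            if (f) {
                fclose(f);
            }
            f = other.f;
            other.f = nullptr;
        }
        return *this;
    }
    ScopedFile(const ScopedFile &) = delete;
    ScopedFile &operator=(const ScopedFile &) = delete;
    ~ScopedFile() {
        if (f) {
            fclose(f);
        }
    }
    // Implicit conversion lets the wrapper be passed straight to fprintf/fwrite.
    operator FILE *() const { return f; }
};
```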
roseDumpPrograms(fragments, t, grey.dumpPath); + roseDumpLiteralMatchers(t, grey.dumpPath); // Graph. dumpRoseGraph(build, t, fragments, leftfix_queue_map, suffix_queue_map, @@ -2250,9 +2255,8 @@ void dumpRose(const RoseBuildImpl &build, const vector &fragments, // Literals dumpRoseLiterals(build, fragments, grey); - f = fopen((grey.dumpPath + "/rose_struct.txt").c_str(), "w"); + f = StdioFile(grey.dumpPath + "/rose_struct.txt", "w"); roseDumpStructRaw(t, f); - fclose(f); } } // namespace ue2 diff --git a/src/rose/rose_build_engine_blob.h b/src/rose/rose_build_engine_blob.h index 3aa501b4..da4e355d 100644 --- a/src/rose/rose_build_engine_blob.h +++ b/src/rose/rose_build_engine_blob.h @@ -36,13 +36,14 @@ #include "util/bytecode_ptr.h" #include "util/charreach.h" #include "util/container.h" +#include "util/hash.h" #include "util/multibit_build.h" #include "util/noncopyable.h" -#include "util/ue2_containers.h" #include "util/verify_types.h" +#include "util/unordered.h" -#include #include +#include namespace ue2 { @@ -56,9 +57,10 @@ struct lookaround_info : noncopyable { u32 get_offset_of(const std::vector &look, RoseEngineBlob &blob); private: - unordered_map>, u32> multi_cache; - unordered_map, u32> lcache; - unordered_map, u32> rcache; + using Path = std::vector; + ue2_unordered_map, u32> multi_cache; + ue2_unordered_map, u32> lcache; + ue2_unordered_map rcache; }; class RoseEngineBlob : noncopyable { @@ -160,7 +162,7 @@ private: } /** \brief Cache of previously-written sparse iterators. */ - unordered_map, u32> cached_iters; + ue2_unordered_map, u32> cached_iters; /** * \brief Contents of the Rose bytecode immediately following the diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp index e91cc297..6a5a710d 100644 --- a/src/rose/rose_build_exclusive.cpp +++ b/src/rose/rose_build_exclusive.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,9 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "ue2common.h" - #include "rose_build_exclusive.h" + +#include "ue2common.h" #include "rose_build_merge.h" #include "nfa/castlecompile.h" #include "nfagraph/ng_execute.h" @@ -37,6 +37,7 @@ #include "util/clique.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/make_unique.h" @@ -87,7 +88,7 @@ vector> divideIntoChunks(const RoseBuildImpl &build, /* add prefix literals to engine graph */ static -bool addPrefixLiterals(NGHolder &h, ue2::unordered_set &tailId, +bool addPrefixLiterals(NGHolder &h, unordered_set &tailId, const vector> &triggers) { DEBUG_PRINTF("add literals to graph\n"); @@ -196,8 +197,8 @@ vector findStartPos(const CharReach &cr1, template static bool isExclusive(const NGHolder &h, - const u32 num, ue2::unordered_set &tailId, - map> &skipList, + const u32 num, unordered_set &tailId, + map> &skipList, const RoleInfo &role1, const RoleInfo &role2) { const u32 id1 = role1.id; @@ -218,29 +219,29 @@ bool isExclusive(const NGHolder &h, const auto &cr1 = role1.cr; if (overlaps(cr1, role2.last_cr)) { CharReach cr = cr1 | role1.prefix_cr; + flat_set states; for (const auto &lit : triggers2) { auto lit1 = findStartPos(cr, lit); if (lit1.empty()) { continue; } - u32 lower_bound = 0; - if (lit1.size() < lit.size()) { - lower_bound = ~0U; - } - ue2::flat_set states; - for (const auto &v : vertices_range(h)) { - if (h[v].index >= lower_bound || h[v].index < 2) { - states.insert(v); - } + states.clear(); + + if (lit1.size() < lit.size()) { + // Only starts. + states.insert(h.start); + states.insert(h.startDs); + } else { + // All vertices. + insert(&states, vertices(h)); } auto activeStates = execute_graph(h, lit1, states); - // Check if has only literal states are on + // Check if only literal states are on for (const auto &s : activeStates) { - u32 stateId = h[s].index; - if ((stateId > 1 && stateId <= num) || - contains(tailId, stateId)) { + if ((!is_any_start(s, h) && h[s].index <= num) || + contains(tailId, h[s].index)) { skipList[id2].insert(id1); return false; } @@ -253,12 +254,12 @@ bool isExclusive(const NGHolder &h, template static -ue2::unordered_set checkExclusivity(const NGHolder &h, - const u32 num, ue2::unordered_set &tailId, - map> &skipList, - const RoleInfo &role1, - const RoleChunk &roleChunk) { - ue2::unordered_set info; +unordered_set checkExclusivity(const NGHolder &h, + const u32 num, unordered_set &tailId, + map> &skipList, + const RoleInfo &role1, + const RoleChunk &roleChunk) { + unordered_set info; const u32 id1 = role1.id; for (const auto &role2 : roleChunk.roles) { const u32 id2 = role2.id; @@ -316,7 +317,7 @@ void findCliques(const map> &exclusiveGroups, static map> findExclusiveGroups(const RoseBuildImpl &build, - const map> &exclusiveInfo, + const map> &exclusiveInfo, const map> &vertex_map, const bool is_infix) { map> exclusiveGroups; @@ -396,10 +397,10 @@ void exclusiveAnalysis(const RoseBuildImpl &build, vector> &exclusive_roles, const bool is_infix) { const auto &chunks = divideIntoChunks(build, roleInfoSet); DEBUG_PRINTF("Exclusivity analysis entry\n"); - map> exclusiveInfo; + map> exclusiveInfo; for (const auto &roleChunk : chunks) { - map> skipList; + map> skipList; for (const auto &role1 : roleChunk.roles) { const u32 id1 = role1.id; const role_id &s1 = role1.role; diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h index 3ab5eb78..ada64b80 100644 --- a/src/rose/rose_build_groups.h +++ b/src/rose/rose_build_groups.h @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,11 +35,12 @@ #define ROSE_BUILD_GROUPS_H #include "rose_build_impl.h" -#include "util/ue2_containers.h" + +#include namespace ue2 { -unordered_map +std::unordered_map getVertexGroupMap(const RoseBuildImpl &build); rose_group getSquashableGroups(const RoseBuildImpl &build); diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 13f1cfc9..900aee6c 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -39,11 +39,12 @@ #include "nfagraph/ng_holder.h" #include "nfagraph/ng_revacc.h" #include "util/bytecode_ptr.h" +#include "util/flat_containers.h" #include "util/hash.h" #include "util/order_check.h" #include "util/queue_index_factory.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" +#include "util/unordered.h" #include "util/verify_types.h" #include @@ -177,7 +178,6 @@ depth findMinWidth(const suffix_id &s); depth findMaxWidth(const suffix_id &s); depth findMinWidth(const suffix_id &s, u32 top); depth findMaxWidth(const suffix_id &s, u32 top); -size_t hash_value(const suffix_id &s); /** \brief represents an engine to the left of a rose role */ struct left_id { @@ -254,15 +254,15 @@ private: }; std::set all_tops(const left_id &r); +std::set all_reports(const left_id &left); bool isAnchored(const left_id &r); depth findMinWidth(const left_id &r); depth findMaxWidth(const left_id &r); u32 num_tops(const left_id &r); -size_t hash_value(const left_id &r); struct rose_literal_info { - ue2::flat_set delayed_ids; - ue2::flat_set vertices; + flat_set delayed_ids; + flat_set vertices; rose_group group_mask = 0; u32 undelayed_id = MO_INVALID_IDX; bool squash_group = false; @@ -306,6 +306,10 @@ struct rose_literal_id { return s == b.s && msk == b.msk && cmp == b.cmp && table == b.table && delay == b.delay && distinctiveness == b.distinctiveness; } + + size_t hash() const { + return hash_all(s, msk, cmp, table, delay, distinctiveness); + } }; static inline @@ -319,12 +323,6 @@ bool operator<(const rose_literal_id &a, const rose_literal_id &b) { return 0; } -inline -size_t hash_value(const rose_literal_id &lit) { - return hash_all(lit.s, lit.msk, lit.cmp, lit.table, lit.delay, - lit.distinctiveness); -} - class RoseLiteralMap { /** * \brief Main storage for literals. @@ -336,7 +334,7 @@ class RoseLiteralMap { std::deque lits; /** \brief Quick-lookup index from literal -> index in lits. */ - unordered_map lits_index; + ue2_unordered_map lits_index; public: std::pair insert(const rose_literal_id &lit) { @@ -504,7 +502,7 @@ public: // Adds a single literal. 
void add(bool anchored, bool eod, const ue2_literal &lit, - const ue2::flat_set &ids) override; + const flat_set &ids) override; bool addRose(const RoseInGraph &ig, bool prefilter) override; bool addSombeRose(const RoseInGraph &ig) override; @@ -517,15 +515,15 @@ public: // Returns true if we were able to add it as a mask bool add(bool anchored, const std::vector &mask, - const ue2::flat_set &reports) override; + const flat_set &reports) override; bool addAnchoredAcyclic(const NGHolder &graph) override; bool validateMask(const std::vector &mask, - const ue2::flat_set &reports, bool anchored, + const flat_set &reports, bool anchored, bool eod) const override; void addMask(const std::vector &mask, - const ue2::flat_set &reports, bool anchored, + const flat_set &reports, bool anchored, bool eod) override; // Construct a runtime implementation. @@ -627,8 +625,8 @@ public: * overlap calculation in history assignment. */ std::map anchoredLitSuffix; - unordered_set transient; - unordered_map rose_squash_masks; + ue2_unordered_set transient; + ue2_unordered_map rose_squash_masks; std::vector outfixes; @@ -689,4 +687,22 @@ bool canImplementGraphs(const RoseBuildImpl &tbi); } // namespace ue2 +namespace std { + +template<> +struct hash { + size_t operator()(const ue2::left_id &l) const { + return l.hash(); + } +}; + +template<> +struct hash { + size_t operator()(const ue2::suffix_id &s) const { + return s.hash(); + } +}; + +} // namespace std + #endif /* ROSE_BUILD_IMPL_H */ diff --git a/src/rose/rose_build_infix.cpp b/src/rose/rose_build_infix.cpp index 4bbb3525..80e12542 100644 --- a/src/rose/rose_build_infix.cpp +++ b/src/rose/rose_build_infix.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,10 +36,12 @@ #include "rose/rose_build_impl.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/flat_containers.h" #include "util/graph_range.h" #include "util/graph.h" -#include "util/ue2_containers.h" +#include "util/hash.h" #include "util/ue2string.h" +#include "util/unordered.h" #include #include @@ -51,7 +53,7 @@ namespace ue2 { static bool couldEndLiteral(const ue2_literal &s, NFAVertex initial, const NGHolder &h) { - ue2::flat_set curr, next; + flat_set curr, next; curr.insert(initial); for (auto it = s.rbegin(), ite = s.rend(); it != ite; ++it) { @@ -82,9 +84,10 @@ bool couldEndLiteral(const ue2_literal &s, NFAVertex initial, return true; } +using EdgeCache = ue2_unordered_set>; + static -void contractVertex(NGHolder &g, NFAVertex v, - ue2::unordered_set> &all_edges) { +void contractVertex(NGHolder &g, NFAVertex v, EdgeCache &all_edges) { for (auto u : inv_adjacent_vertices_range(v, g)) { if (u == v) { continue; // self-edge @@ -144,8 +147,9 @@ u32 findMaxLiteralMatches(const NGHolder &h, const set &lits) { cloneHolder(g, h); vector dead; - // The set of all edges in the graph is used for existence checks in contractVertex. - ue2::unordered_set> all_edges; + // The set of all edges in the graph is used for existence checks in + // contractVertex. 
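Several of the hunks above move hashing from free `hash_value()` overloads (used with Boost) to a member `hash()` built on `hash_all`, plus `std::hash` specializations for `left_id` and `suffix_id`, so these types drop straight into standard and ue2 unordered containers. A sketch of the pattern; `hash_all`, `hash_combine`, and the mixing constant here are illustrative stand-ins, not ue2's util/hash.h:

```cpp
#include <cstddef>
#include <functional>
#include <string>

namespace demo {

template <typename T>
void hash_combine(std::size_t &seed, const T &value) {
    seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Variadic helper: combine the hashes of all fields (C++17 fold expression).
template <typename... Args>
std::size_t hash_all(const Args &...args) {
    std::size_t seed = 0;
    (hash_combine(seed, args), ...);
    return seed;
}

struct literal_id {
    std::string s;
    unsigned delay = 0;

    bool operator==(const literal_id &o) const {
        return s == o.s && delay == o.delay;
    }
    std::size_t hash() const { return hash_all(s, delay); }
};

} // namespace demo

namespace std {
template <> struct hash<demo::literal_id> {
    size_t operator()(const demo::literal_id &l) const { return l.hash(); }
};
} // namespace std

// With the specialization in place, the type works directly in containers:
//   std::unordered_set<demo::literal_id> seen;
//   seen.insert({"foo", 0});
```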
+ EdgeCache all_edges; for (const auto &e : edges_range(g)) { all_edges.emplace(source(e, g), target(e, g)); } diff --git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index b00c36be..8af08298 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -636,4 +636,12 @@ void RoseInstrCheckMultipathShufti64::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrIncludedJump::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->child_offset = child_offset; + inst->squash = squash; +} + } diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index 025f6a67..d3ede29b 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -39,6 +39,7 @@ #include "rose_build_lookaround.h" #include "rose_build_program.h" +#include "util/hash.h" #include "util/verify_types.h" namespace ue2 { @@ -65,7 +66,7 @@ public: /** \brief Length of the bytecode instruction in bytes. */ virtual size_t byte_length() const = 0; - using OffsetMap = unordered_map; + using OffsetMap = std::unordered_map; /** * \brief Writes a concrete implementation of this instruction. @@ -149,6 +150,10 @@ private: } }; +template +constexpr RoseInstructionCode + RoseInstrBase::opcode; + /** * \brief Refinement of RoseInstrBase to use for instructions that have * just a single target member, called "target". @@ -190,7 +195,7 @@ public: virtual bool operator==(const RoseInstrType &) const { return true; } size_t hash() const override { - return boost::hash_value(static_cast(Opcode)); + return hash_all(Opcode); } bool equiv_to(const RoseInstrType &, const RoseInstruction::OffsetMap &, @@ -222,7 +227,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), groups, anch_id); + return hash_all(opcode, groups, anch_id); } void write(void *dest, RoseEngineBlob &blob, @@ -251,7 +256,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), min_offset); + return hash_all(opcode, min_offset); } void write(void *dest, RoseEngineBlob &blob, @@ -278,7 +283,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), groups); + return hash_all(opcode, groups); } void write(void *dest, RoseEngineBlob &blob, @@ -305,7 +310,7 @@ public: } size_t hash() const override { - return boost::hash_value(static_cast(opcode)); + return hash_all(opcode); } void write(void *dest, RoseEngineBlob &blob, @@ -335,7 +340,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), min_bound, max_bound); + return hash_all(opcode, min_bound, max_bound); } void write(void *dest, RoseEngineBlob &blob, @@ -364,7 +369,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), key); + return hash_all(opcode, key); } void write(void *dest, RoseEngineBlob &blob, @@ -395,7 +400,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), offset, reach); + return hash_all(opcode, offset, reach); } void write(void *dest, RoseEngineBlob &blob, @@ -426,7 +431,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), look); + return hash_all(opcode, look); } void write(void *dest, RoseEngineBlob &blob, @@ -462,8 +467,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, 
cmp_mask, neg_mask, - offset); + return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset); } void write(void *dest, RoseEngineBlob &blob, @@ -501,8 +505,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, cmp_mask, neg_mask, - offset); + return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset); } void write(void *dest, RoseEngineBlob &blob, @@ -539,8 +542,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, cmp_mask, negation, - offset); + return hash_all(opcode, and_mask, cmp_mask, negation, offset); } void write(void *dest, RoseEngineBlob &blob, @@ -581,8 +583,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), nib_mask, - bucket_select_mask, neg_mask, offset); + return hash_all(opcode, nib_mask, bucket_select_mask, neg_mask, offset); } void write(void *dest, RoseEngineBlob &blob, @@ -626,8 +627,8 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, neg_mask, offset); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask, + offset); } void write(void *dest, RoseEngineBlob &blob, @@ -671,8 +672,8 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, neg_mask, offset); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask, + offset); } void write(void *dest, RoseEngineBlob &blob, @@ -720,9 +721,8 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask_hi, bucket_select_mask_lo, - neg_mask, offset); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask_hi, + bucket_select_mask_lo, neg_mask, offset); } void write(void *dest, RoseEngineBlob &blob, @@ -758,7 +758,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag, report); + return hash_all(opcode, queue, lag, report); } void write(void *dest, RoseEngineBlob &blob, @@ -791,7 +791,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag, report); + return hash_all(opcode, queue, lag, report); } void write(void *dest, RoseEngineBlob &blob, @@ -820,7 +820,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), delay, index); + return hash_all(opcode, delay, index); } void write(void *dest, RoseEngineBlob &blob, @@ -861,7 +861,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), distance); + return hash_all(opcode, distance); } void write(void *dest, RoseEngineBlob &blob, @@ -889,7 +889,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag); + return hash_all(opcode, queue, lag); } void write(void *dest, RoseEngineBlob &blob, @@ -917,7 +917,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); + return hash_all(opcode, som.type, som.onmatch); } void write(void *dest, RoseEngineBlob &blob, @@ -953,7 +953,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), cancel, queue, event); + return hash_all(opcode, cancel, queue, event); } void write(void *dest, RoseEngineBlob &blob, @@ -981,7 +981,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), queue, event); + return hash_all(opcode, queue, event); } void write(void *dest, RoseEngineBlob &blob, @@ -1013,8 +1013,7 @@ public: } 
size_t hash() const override { - return hash_all(static_cast(opcode), quash_som, dkey, - offset_adjust); + return hash_all(opcode, quash_som, dkey, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1049,8 +1048,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), quash_som, dkey, - offset_adjust); + return hash_all(opcode, quash_som, dkey, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1081,7 +1079,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), event, top_squash_distance); + return hash_all(opcode, event, top_squash_distance); } void write(void *dest, RoseEngineBlob &blob, @@ -1110,7 +1108,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); + return hash_all(opcode, som.type, som.onmatch); } void write(void *dest, RoseEngineBlob &blob, @@ -1138,7 +1136,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); + return hash_all(opcode, som.type, som.onmatch); } void write(void *dest, RoseEngineBlob &blob, @@ -1165,7 +1163,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust); + return hash_all(opcode, onmatch, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1196,7 +1194,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust, ekey); + return hash_all(opcode, onmatch, offset_adjust, ekey); } void write(void *dest, RoseEngineBlob &blob, @@ -1225,7 +1223,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust); + return hash_all(opcode, onmatch, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1256,7 +1254,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust, ekey); + return hash_all(opcode, onmatch, offset_adjust, ekey); } void write(void *dest, RoseEngineBlob &blob, @@ -1293,8 +1291,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), quash_som, dkey, onmatch, - offset_adjust); + return hash_all(opcode, quash_som, dkey, onmatch, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1324,7 +1321,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust); + return hash_all(opcode, onmatch, offset_adjust); } void write(void *dest, RoseEngineBlob &blob, @@ -1352,7 +1349,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), ekey); + return hash_all(opcode, ekey); } void write(void *dest, RoseEngineBlob &blob, @@ -1384,7 +1381,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), end_adj, min_length); + return hash_all(opcode, end_adj, min_length); } void write(void *dest, RoseEngineBlob &blob, @@ -1410,7 +1407,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), index); + return hash_all(opcode, index); } void write(void *dest, RoseEngineBlob &blob, @@ -1436,7 +1433,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), groups); + return hash_all(opcode, groups); } void write(void *dest, RoseEngineBlob &blob, @@ -1462,7 +1459,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), groups); + return hash_all(opcode, groups); } void write(void *dest, RoseEngineBlob &blob, @@ -1490,7 +1487,7 @@ 
public: } size_t hash() const override { - return hash_all(static_cast(opcode), index); + return hash_all(opcode, index); } void write(void *dest, RoseEngineBlob &blob, @@ -1522,9 +1519,9 @@ public: } size_t hash() const override { - size_t v = hash_all(static_cast(opcode), num_keys); + size_t v = hash_all(opcode, num_keys); for (const u32 &key : jump_table | boost::adaptors::map_keys) { - boost::hash_combine(v, key); + hash_combine(v, key); } return v; } @@ -1594,7 +1591,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), state); + return hash_all(opcode, state); } void write(void *dest, RoseEngineBlob &blob, @@ -1638,7 +1635,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), num_keys, keys); + return hash_all(opcode, num_keys, keys); } void write(void *dest, RoseEngineBlob &blob, @@ -1665,7 +1662,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), iter_offset); + return hash_all(opcode, iter_offset); } void write(void *dest, RoseEngineBlob &blob, @@ -1709,7 +1706,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), literal); + return hash_all(opcode, literal); } void write(void *dest, RoseEngineBlob &blob, @@ -1741,7 +1738,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), literal); + return hash_all(opcode, literal); } void write(void *dest, RoseEngineBlob &blob, @@ -1772,7 +1769,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), literal); + return hash_all(opcode, literal); } void write(void *dest, RoseEngineBlob &blob, @@ -1804,7 +1801,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), literal); + return hash_all(opcode, literal); } void write(void *dest, RoseEngineBlob &blob, @@ -1849,8 +1846,7 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), multi_look, last_start, - start_mask); + return hash_all(opcode, multi_look, last_start, start_mask); } void write(void *dest, RoseEngineBlob &blob, @@ -1905,9 +1901,9 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), nib_mask, - bucket_select_mask, data_select_mask, hi_bits_mask, - lo_bits_mask, neg_mask, base_offset, last_start); + return hash_all(opcode, nib_mask, bucket_select_mask, data_select_mask, + hi_bits_mask, lo_bits_mask, neg_mask, base_offset, + last_start); } void write(void *dest, RoseEngineBlob &blob, @@ -1968,9 +1964,9 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, data_select_mask, hi_bits_mask, - lo_bits_mask, neg_mask, base_offset, last_start); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, + data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, + base_offset, last_start); } void write(void *dest, RoseEngineBlob &blob, @@ -2035,10 +2031,9 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask_hi, bucket_select_mask_lo, - data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, - base_offset, last_start); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask_hi, + bucket_select_mask_lo, data_select_mask, hi_bits_mask, + lo_bits_mask, neg_mask, base_offset, last_start); } void write(void *dest, RoseEngineBlob &blob, @@ -2100,9 +2095,9 @@ public: } size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, 
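Note: the hash() overrides above switch from hashing an explicitly converted opcode to passing the opcode straight to hash_all() (the template argument of the original static_cast has been lost in rendering here). Below is a minimal sketch of what a variadic hash_all built from std::hash plus a boost-style hash_combine can look like; it assumes C++14 or later so that std::hash is defined for enumeration types. The in-tree implementation lives in src/util/hash.h and differs in detail.

    #include <cstddef>
    #include <functional>

    // Illustrative only: mix one value's hash into a seed (boost-style
    // constant), then fold the mix over a parameter pack.
    template <typename T>
    void hash_combine(std::size_t &seed, const T &value) {
        seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }

    inline void hash_build(std::size_t &) {} // base case: nothing left to mix

    template <typename T, typename... Rest>
    void hash_build(std::size_t &seed, const T &first, const Rest &...rest) {
        hash_combine(seed, first);
        hash_build(seed, rest...);
    }

    template <typename... Args>
    std::size_t hash_all(const Args &...args) {
        std::size_t seed = 0;
        hash_build(seed, args...);
        return seed;
    }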
data_select_mask, hi_bits_mask, - lo_bits_mask, neg_mask, base_offset, last_start); + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, + data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, + base_offset, last_start); } void write(void *dest, RoseEngineBlob &blob, @@ -2121,6 +2116,34 @@ public: } }; +class RoseInstrIncludedJump + : public RoseInstrBaseNoTargets { +public: + u32 child_offset; + u8 squash; + + RoseInstrIncludedJump(u32 child_offset_in, u8 squash_in) + : child_offset(child_offset_in), squash(squash_in) {} + + bool operator==(const RoseInstrIncludedJump &ri) const { + return child_offset == ri.child_offset && squash == ri.squash; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), child_offset, squash); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrIncludedJump &ri, const OffsetMap &, + const OffsetMap &) const { + return child_offset == ri.child_offset && squash == ri.squash; + } +}; + class RoseInstrEnd : public RoseInstrBaseTrivial { diff --git a/src/rose/rose_build_long_lit.cpp b/src/rose/rose_build_long_lit.cpp index 7ebf73ec..45a2eb27 100644 --- a/src/rose/rose_build_long_lit.cpp +++ b/src/rose/rose_build_long_lit.cpp @@ -44,7 +44,7 @@ using namespace std; namespace ue2 { /** \brief Minimum size for a non-empty hash table. Must be a power of two. */ -static constexpr u32 MIN_HASH_TABLE_SIZE = 128; +static constexpr size_t MIN_HASH_TABLE_SIZE = 128; /** \brief Maximum load factor (between zero and one) for a hash table. */ static constexpr double MAX_HASH_TABLE_LOAD = 0.7; @@ -167,30 +167,69 @@ vector makeBloomFilter(const vector &lits, return bloom; } -static +static UNUSED size_t hashTableOccupancy(const vector &tab) { return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) { return ent.str_offset != 0; }); } -static +static UNUSED double hashTableLoad(const vector &tab) { return (double)hashTableOccupancy(tab) / (double)(tab.size()); } +using LitOffsetVector = small_vector, 1>; + static -vector buildHashTable(const vector &lits, - size_t max_len, - const vector &litToOffsetVal, - size_t numEntries, bool nocase) { +vector buildHashTable( + size_t max_len, const vector &litToOffsetVal, + const map &hashToLitOffPairs, + size_t numEntries) { vector tab(numEntries, {0,0}); if (!numEntries) { return tab; } - map>> hashToLitOffPairs; + for (const auto &m : hashToLitOffPairs) { + u32 hash = m.first; + const LitOffsetVector &d = m.second; + + u32 bucket = hash % numEntries; + + // Placement via linear probing. 
+ for (const auto &lit_offset : d) { + while (tab[bucket].str_offset != 0) { + bucket++; + if (bucket == numEntries) { + bucket = 0; + } + } + + u32 lit_id = lit_offset.first; + u32 offset = lit_offset.second; + + DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash, + lit_id, offset, bucket); + + auto &entry = tab[bucket]; + entry.str_offset = verify_u32(litToOffsetVal.at(lit_id)); + assert(entry.str_offset != 0); + entry.str_len = offset + max_len; + } + } + + DEBUG_PRINTF("hash table occupancy %zu of %zu entries\n", + hashTableOccupancy(tab), numEntries); + + return tab; +} + +static +map computeLitHashes(const vector &lits, + size_t max_len, bool nocase) { + map hashToLitOffPairs; for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) { const ue2_case_string &lit = lits[lit_id]; @@ -205,8 +244,10 @@ vector buildHashTable(const vector &lits, } for (auto &m : hashToLitOffPairs) { - u32 hash = m.first; - vector> &d = m.second; + LitOffsetVector &d = m.second; + if (d.size() == 1) { + continue; + } // Sort by (offset, string) so that we'll be able to remove identical // string prefixes. @@ -240,36 +281,9 @@ vector buildHashTable(const vector &lits, } return a.first < b.first; }); - - u32 bucket = hash % numEntries; - - // Placement via linear probing. - for (const auto &lit_offset : d) { - while (tab[bucket].str_offset != 0) { - bucket++; - if (bucket == numEntries) { - bucket = 0; - } - } - - u32 lit_id = lit_offset.first; - u32 offset = lit_offset.second; - - DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash, - lit_id, offset, bucket); - - auto &entry = tab[bucket]; - entry.str_offset = verify_u32(litToOffsetVal.at(lit_id)); - assert(entry.str_offset != 0); - entry.str_len = offset + max_len; - } } - DEBUG_PRINTF("%s hash table occupancy %zu of %zu entries\n", - nocase ? "nocase" : "caseful", hashTableOccupancy(tab), - numEntries); - - return tab; + return hashToLitOffPairs; } static @@ -277,24 +291,21 @@ vector makeHashTable(const vector &lits, size_t max_len, const vector &litToOffsetVal, u32 numPositions, bool nocase) { - vector tab; + // Compute lit substring hashes. + const auto hashToLitOffPairs = computeLitHashes(lits, max_len, nocase); - // Note: for the hash table, we must always have at least enough entries - // for the number of hashable positions. - size_t num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, - numPositions)); + // Compute the size of the hash table: we need enough entries to satisfy + // our max load constraint, and it must be a power of two. + size_t num_entries = (double)numPositions / MAX_HASH_TABLE_LOAD + 1; + num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, num_entries)); + + auto tab = buildHashTable(max_len, litToOffsetVal, hashToLitOffPairs, + num_entries); + DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n", + nocase ? "nocase" : "caseful", num_entries, + hashTableLoad(tab)); + assert(hashTableLoad(tab) < MAX_HASH_TABLE_LOAD); - for (;;) { - tab = buildHashTable(lits, max_len, litToOffsetVal, num_entries, - nocase); - DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n", - nocase ? 
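Note: the reworked long-literal hash table code above now sizes the table once up front from the maximum load factor (instead of growing and rebuilding in a loop) and then places entries by linear probing with wrap-around. A toy model of those two steps, with stand-in types instead of RoseLongLitHashEntry and the literal/offset machinery:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // key == 0 marks an empty slot, as str_offset == 0 does in the real table.
    struct Entry {
        uint32_t key = 0;
        uint32_t value = 0;
    };

    static size_t roundUpToPowerOfTwo(size_t x) {
        size_t p = 1;
        while (p < x) {
            p <<= 1;
        }
        return p;
    }

    int main() {
        constexpr size_t MIN_TABLE_SIZE = 8; // mirrors MIN_HASH_TABLE_SIZE
        constexpr double MAX_LOAD = 0.7;     // mirrors MAX_HASH_TABLE_LOAD

        const std::vector<std::pair<uint32_t, uint32_t>> items = {
            {0x1234, 10}, {0x5678, 20}, {0x9abc, 30}, {0x1244, 40}};

        // Enough slots to keep occupancy under MAX_LOAD, rounded up to 2^n.
        size_t num_entries = static_cast<size_t>(items.size() / MAX_LOAD) + 1;
        num_entries = roundUpToPowerOfTwo(std::max(MIN_TABLE_SIZE, num_entries));

        std::vector<Entry> tab(num_entries);
        for (const auto &item : items) {
            size_t bucket = item.first % num_entries;
            while (tab[bucket].key != 0) {          // linear probing on collision
                bucket = (bucket + 1) % num_entries; // wrap around at the end
            }
            tab[bucket] = {item.first, item.second};
        }

        printf("placed %zu items in %zu buckets\n", items.size(), num_entries);
        return 0;
    }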
"nocase" : "caseful", num_entries, - hashTableLoad(tab)); - if (hashTableLoad(tab) < MAX_HASH_TABLE_LOAD) { - break; - } - num_entries *= 2; - } return tab; } @@ -383,7 +394,7 @@ u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, if (info.nocase.num_literals) { bloom_nocase = makeBloomFilter(lits, max_len, true); tab_nocase = makeHashTable(lits, max_len, litToOffsetVal, - info.nocase.hashed_positions, true); + info.nocase.hashed_positions, true); } size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob)); diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index a46a1aeb..7cc1c584 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -40,7 +40,7 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/verify_types.h" #include @@ -79,7 +79,7 @@ string dump(const map &look) { static void getForwardReach(const NGHolder &g, u32 top, map &look) { - ue2::flat_set curr, next; + flat_set curr, next; // Consider only successors of start with the required top. for (const auto &e : out_edges_range(g.start, g)) { @@ -116,7 +116,7 @@ void getForwardReach(const NGHolder &g, u32 top, map &look) { static void getBackwardReach(const NGHolder &g, ReportID report, u32 lag, map &look) { - ue2::flat_set curr, next; + flat_set curr, next; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (contains(g[v].reports, report)) { @@ -187,7 +187,7 @@ void getForwardReach(const raw_dfa &rdfa, map &look) { return; } - ue2::flat_set curr, next; + flat_set curr, next; curr.insert(rdfa.start_anchored); for (u32 i = 0; i < MAX_FWD_LEN && !curr.empty(); i++) { @@ -485,19 +485,17 @@ vector findLiteralReach(const rose_literal_id &lit) { } static -map findLiteralReach(const RoseBuildImpl &build, - const RoseVertex v) { +vector findLiteralReach(const RoseBuildImpl &build, + const RoseVertex v) { bool first = true; - map look; + vector look; for (u32 lit_id : build.g[v].literals) { const rose_literal_id &lit = build.literals.at(lit_id); auto lit_look = findLiteralReach(lit); if (first) { - for (auto &p : lit_look) { - look.emplace(p.offset, p.reach); - } + look = std::move(lit_look); first = false; continue; } @@ -512,22 +510,21 @@ map findLiteralReach(const RoseBuildImpl &build, look.erase(it, end(look)); break; } - if (it->first < jt->offset) { + if (it->offset < jt->offset) { // Offset is present in look but not in lit_look, erase. it = look.erase(it); - } else if (it->first > jt->offset) { + } else if (it->offset > jt->offset) { // Offset is preset in lit_look but not in look, ignore. ++jt; } else { // Offset is present in both, union its reach with look. - it->second |= jt->reach; + it->reach |= jt->reach; ++it; ++jt; } } } - DEBUG_PRINTF("lit lookaround: %s\n", dump(look).c_str()); return look; } @@ -541,11 +538,11 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v, DEBUG_PRINTF("pre-trim lookaround: %s\n", dump(look).c_str()); for (const auto &m : findLiteralReach(build, v)) { - auto it = look.find(m.first); + auto it = look.find(m.offset); if (it == end(look)) { continue; } - if (m.second.isSubsetOf(it->second)) { + if (m.reach.isSubsetOf(it->second)) { DEBUG_PRINTF("can trim entry at %d\n", it->first); look.erase(it); } @@ -849,7 +846,7 @@ void mergeLookaround(vector &lookaround, } // Don't merge lookarounds at offsets we already have entries for. 
- ue2::flat_set offsets; + flat_set offsets; for (const auto &e : lookaround) { offsets.insert(e.offset); } diff --git a/src/rose/rose_build_lookaround.h b/src/rose/rose_build_lookaround.h index aea87ccf..70d4217c 100644 --- a/src/rose/rose_build_lookaround.h +++ b/src/rose/rose_build_lookaround.h @@ -33,6 +33,7 @@ #define ROSE_ROSE_BUILD_LOOKAROUND_H #include "rose_graph.h" +#include "util/hash.h" #include @@ -58,14 +59,6 @@ struct LookEntry { } }; -static inline -size_t hash_value(const LookEntry &l) { - size_t val = 0; - boost::hash_combine(val, l.offset); - boost::hash_combine(val, l.reach); - return val; -} - void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, std::vector &look_more); @@ -83,4 +76,15 @@ void mergeLookaround(std::vector &lookaround, } // namespace ue2 +namespace std { + +template<> +struct hash { + size_t operator()(const ue2::LookEntry &l) const { + return ue2::hash_all(l.offset, l.reach); + } +}; + +} // namespace std + #endif // ROSE_ROSE_BUILD_LOOKAROUND_H diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 682a87c3..2c302a85 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -46,6 +46,7 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/dump_charclass.h" +#include "util/make_unique.h" #include "util/report.h" #include "util/report_manager.h" #include "util/verify_types.h" @@ -55,6 +56,7 @@ #include #include +#include using namespace std; using boost::adaptors::map_values; @@ -63,7 +65,7 @@ namespace ue2 { static const size_t MAX_ACCEL_STRING_LEN = 16; -#ifdef DEBUG +#if defined(DEBUG) || defined(DUMP_SUPPORT) static UNUSED string dumpMask(const vector &v) { ostringstream oss; @@ -231,28 +233,12 @@ bool maskFromPreds(const RoseBuildImpl &build, const rose_literal_id &id, } static -bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id, - const rose_literal_info &info, const RoseVertex v, - vector &msk, vector &cmp) { +bool addSurroundingMask(const RoseBuildImpl &build, const rose_literal_id &id, + const RoseVertex v, vector &msk, vector &cmp) { // Start with zero masks. msk.assign(HWLM_MASKLEN, 0); cmp.assign(HWLM_MASKLEN, 0); - // Masks can come from literal benefits (for mixed-case literals). 
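Note: the rose_build_lookaround.h change above drops the boost-style hash_value() free function in favour of a std::hash specialization, so LookEntry can key standard unordered containers directly. A minimal sketch of the same pattern on a stand-in struct (bitmask instead of CharReach, with the combine step written out inline):

    #include <cstdint>
    #include <functional>
    #include <unordered_set>

    struct Look {
        int8_t offset;
        uint32_t reach_bits; // stand-in for the CharReach member

        bool operator==(const Look &other) const {
            return offset == other.offset && reach_bits == other.reach_bits;
        }
    };

    namespace std {
    template <>
    struct hash<Look> {
        size_t operator()(const Look &l) const {
            // Same idea as ue2::hash_all(l.offset, l.reach): combine field hashes.
            size_t seed = std::hash<int8_t>()(l.offset);
            seed ^= std::hash<uint32_t>()(l.reach_bits) + 0x9e3779b9 +
                    (seed << 6) + (seed >> 2);
            return seed;
        }
    };
    } // namespace std

    int main() {
        std::unordered_set<Look> seen;
        seen.insert({-2, 0xff});
        seen.insert({-2, 0xff}); // duplicate: the set stays at size 1
        return seen.size() == 1 ? 0 : 1;
    }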
- if (info.requires_benefits) { - assert(mixed_sensitivity(id.s)); - - size_t j = 0; - for (ue2_literal::const_reverse_iterator it = id.s.rbegin(), - ite = id.s.rend(); - it != ite && j < HWLM_MASKLEN; ++it, ++j) { - size_t offset = HWLM_MASKLEN - j - 1; - const CharReach &cr = *it; - make_and_cmp_mask(cr, &msk[offset], &cmp[offset]); - } - return true; - } - const LeftEngInfo &left = build.g[v].left; if (left && left.lag < HWLM_MASKLEN) { if (maskFromLeft(left, msk, cmp)) { @@ -293,9 +279,9 @@ bool hamsterMaskCombine(vector &msk, vector &cmp, } static -bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id, - const rose_literal_info &info, - vector &msk, vector &cmp) { +bool addSurroundingMask(const RoseBuildImpl &build, const rose_literal_id &id, + const rose_literal_info &info, vector &msk, + vector &cmp) { if (!build.cc.grey.roseHamsterMasks) { return false; } @@ -305,11 +291,14 @@ bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id, return false; } + msk.assign(HWLM_MASKLEN, 0); + cmp.assign(HWLM_MASKLEN, 0); + size_t num = 0; vector v_msk, v_cmp; for (RoseVertex v : info.vertices) { - if (!findHamsterMask(build, id, info, v, v_msk, v_cmp)) { + if (!addSurroundingMask(build, id, v, v_msk, v_cmp)) { DEBUG_PRINTF("no mask\n"); return false; } @@ -364,14 +353,6 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { continue; } - if (!lit.msk.empty()) { - continue; - } - - const auto &lit_info = build.literal_info.at(id); - if (lit_info.requires_benefits) { - continue; - } candidates.push_back(id); } @@ -380,14 +361,15 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { auto &lit_info = build.literal_info.at(id); vector msk, cmp; - if (!findHamsterMask(build, lit, lit_info, msk, cmp)) { + if (!addSurroundingMask(build, lit, lit_info, msk, cmp)) { continue; } - assert(!msk.empty()); - DEBUG_PRINTF("found advisory mask for lit_id=%u (%s)\n", id, + DEBUG_PRINTF("found surrounding mask for lit_id=%u (%s)\n", id, dumpString(lit.s).c_str()); u32 new_id = build.getLiteralId(lit.s, msk, cmp, lit.delay, lit.table); - assert(new_id != id); + if (new_id == id) { + continue; + } DEBUG_PRINTF("replacing with new lit_id=%u\n", new_id); // Note that our new literal may already exist and have vertices, etc. @@ -409,6 +391,51 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { } } +// The mask already associated with the literal and any mask due to +// mixed-case is mandatory. 
+static +void addLiteralMask(const rose_literal_id &id, vector &msk, + vector &cmp) { + const size_t suffix_len = min(id.s.length(), size_t{HWLM_MASKLEN}); + bool mixed_suffix = mixed_sensitivity_in(id.s.end() - suffix_len, + id.s.end()); + + if (id.msk.empty() && !mixed_suffix) { + return; + } + + while (msk.size() < HWLM_MASKLEN) { + msk.insert(msk.begin(), 0); + cmp.insert(cmp.begin(), 0); + } + + if (!id.msk.empty()) { + assert(id.msk.size() <= HWLM_MASKLEN); + assert(id.msk.size() == id.cmp.size()); + for (size_t i = 0; i < id.msk.size(); i++) { + size_t mand_offset = msk.size() - i - 1; + size_t lit_offset = id.msk.size() - i - 1; + msk[mand_offset] = id.msk[lit_offset]; + cmp[mand_offset] = id.cmp[lit_offset]; + } + } + + if (mixed_suffix) { + auto it = id.s.rbegin(); + for (size_t i = 0; i < suffix_len; ++i, ++it) { + const auto &c = *it; + if (!c.nocase) { + size_t offset = HWLM_MASKLEN - i - 1; + DEBUG_PRINTF("offset %zu must match 0x%02x exactly\n", offset, + c.c); + make_and_cmp_mask(c, &msk[offset], &cmp[offset]); + } + } + } + + normaliseLiteralMask(id.s, msk, cmp); +} + static bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, const rose_literal_info &info) { @@ -530,6 +557,17 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return true; } +static +bool isNoRunsFragment(const RoseBuildImpl &build, const LitFragment &f, + const size_t max_len) { + // For the fragment to be marked "no runs", every literal it fires must + // need no further confirmation work. + return all_of_in(f.lit_ids, [&](u32 lit_id) { + const auto &info = build.literal_info.at(lit_id); + return isNoRunsLiteral(build, lit_id, info, max_len); + }); +} + static const raw_puff &getChainedPuff(const RoseBuildImpl &build, const Report &report) { @@ -660,6 +698,78 @@ struct MatcherProto { }; } +static +void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, + const LitFragment &f, u32 id, size_t max_len) { + const rose_literal_id &lit = build.literals.at(id); + + DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(lit.s).c_str(), + lit.s.length()); + + vector msk = lit.msk; // copy + vector cmp = lit.cmp; // copy + + bool noruns = isNoRunsFragment(build, f, max_len); + DEBUG_PRINTF("fragment is %s\n", noruns ? "noruns" : "not noruns"); + + auto lit_final = lit.s; // copy + + if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) { + DEBUG_PRINTF("truncating to tail of length %zu\n", + size_t{ROSE_SHORT_LITERAL_LEN_MAX}); + lit_final.erase(0, lit_final.length() - ROSE_SHORT_LITERAL_LEN_MAX); + // We shouldn't have set a threshold below 8 chars. 
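Note: addLiteralMask() merges the literal's own msk/cmp pair with per-byte masks for the case-sensitive characters of a mixed-case suffix. The underlying convention used by the HWLM literal masks is that a trailing byte passes when (byte & msk[i]) == cmp[i]: a caseless ASCII letter masks away the 0x20 case bit, while an exact byte uses a full 0xff mask. A simplified sketch of that check (maskForChar and suffixMatches are illustrative names, not the in-tree helpers):

    #include <cctype>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ByteMask {
        uint8_t msk;
        uint8_t cmp;
    };

    static ByteMask maskForChar(char c, bool nocase) {
        if (nocase && isalpha(static_cast<unsigned char>(c))) {
            return {0xdf, static_cast<uint8_t>(c & 0xdf)}; // ignore the case bit
        }
        return {0xff, static_cast<uint8_t>(c)};            // must match exactly
    }

    // Check the last masks.size() bytes of the buffer against msk/cmp pairs.
    static bool suffixMatches(const char *data, size_t len,
                              const std::vector<ByteMask> &masks) {
        if (len < masks.size()) {
            return false;
        }
        size_t start = len - masks.size();
        for (size_t i = 0; i < masks.size(); i++) {
            uint8_t b = static_cast<uint8_t>(data[start + i]);
            if ((b & masks[i].msk) != masks[i].cmp) {
                return false;
            }
        }
        return true;
    }

    int main() {
        // "foo" with the first two characters caseless, the last case-sensitive.
        std::vector<ByteMask> masks = {maskForChar('f', true),
                                       maskForChar('o', true),
                                       maskForChar('o', false)};
        printf("%d %d\n", suffixMatches("xxFOo", 5, masks),  // 1: matches
               suffixMatches("xxFOO", 5, masks));            // 0: wrong case at end
        return 0;
    }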
+ assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX); + assert(!noruns); + } + + addLiteralMask(lit, msk, cmp); + + const auto &s_final = lit_final.get_string(); + bool nocase = lit_final.any_nocase(); + + DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n", + f.fragment_id, escapeString(s_final).c_str(), (int)nocase, + noruns, dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + + if (!maskIsConsistent(s_final, nocase, msk, cmp)) { + DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + return; + } + + const auto &groups = f.groups; + + mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id, + groups, msk, cmp); +} + +static +void addAccelLiteral(MatcherProto &mp, const rose_literal_id &lit, + const rose_literal_info &info, size_t max_len) { + const auto &s = lit.s; // copy + + DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(s).c_str(), s.length()); + + vector msk = lit.msk; // copy + vector cmp = lit.cmp; // copy + addLiteralMask(lit, msk, cmp); + + if (!maskIsConsistent(s.get_string(), s.any_nocase(), msk, cmp)) { + DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + return; + } + + // Literals used for acceleration must be limited to max_len, as that's all + // we can see in history. + string s_final = lit.s.get_string(); + trim_to_suffix(s_final, max_len); + trim_to_suffix(msk, max_len); + trim_to_suffix(cmp, max_len); + + mp.accel_lits.emplace_back(s_final, lit.s.any_nocase(), msk, cmp, + info.group_mask); +} + /** * \brief Build up a vector of literals (and associated other data) for the * given table. @@ -679,26 +789,27 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build, assert(build.cc.streaming); } + vector used_lit_ids; + for (const auto &f : fragments) { + assert(!f.lit_ids.empty()); + + // All literals that share a fragment are in the same table. + if (build.literals.at(f.lit_ids.front()).table != table) { + continue; // next fragment. + } + + DEBUG_PRINTF("fragment %u, %zu lit_ids\n", f.fragment_id, + f.lit_ids.size()); + + used_lit_ids.clear(); for (u32 id : f.lit_ids) { const rose_literal_id &lit = build.literals.at(id); - - if (lit.table != table) { - continue; /* wrong table */ - } - - if (lit.delay) { - continue; /* delay id's are virtual-ish */ - } - assert(id < build.literal_info.size()); const auto &info = build.literal_info.at(id); - - /* Note: requires_benefits are handled in the literal entries */ - const ue2_literal &s = lit.s; - - DEBUG_PRINTF("lit='%s' (len %zu)\n", escapeString(s).c_str(), - s.length()); + if (lit.delay) { + continue; /* delay id's are virtual-ish */ + } // When building the delay rebuild table, we only want to include // literals that have delayed variants. @@ -716,67 +827,38 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build, } } - const vector &msk = lit.msk; - const vector &cmp = lit.cmp; - bool noruns = isNoRunsLiteral(build, id, info, max_len); + used_lit_ids.push_back(id); + } - size_t lit_hist_len = 0; + if (used_lit_ids.empty()) { + continue; // next fragment. + } + + // Build our fragment (for the HWLM matcher) from the first literal. + addFragmentLiteral(build, mp, f, used_lit_ids.front(), max_len); + + for (u32 id : used_lit_ids) { + const rose_literal_id &lit = build.literals.at(id); + assert(id < build.literal_info.size()); + const auto &info = build.literal_info.at(id); + + // All literals contribute accel information. + addAccelLiteral(mp, lit, info, max_len); + + // All literals contribute to history requirement in streaming mode. 
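Note: addAccelLiteral() trims the literal and its masks to their last max_len bytes because acceleration can only ever see that much history in streaming mode, and addFragmentLiteral() similarly keeps only the tail of over-long literals. A trivial sketch of that suffix-trimming step (trimToSuffix is an illustrative stand-in for the in-tree trim_to_suffix()):

    #include <cstdio>
    #include <string>

    // Keep only the last max_len characters of the string.
    static void trimToSuffix(std::string &s, size_t max_len) {
        if (s.size() > max_len) {
            s.erase(0, s.size() - max_len);
        }
    }

    int main() {
        std::string lit = "foobarbaz";
        trimToSuffix(lit, 6);
        printf("%s\n", lit.c_str()); // prints "barbaz"
        return 0;
    }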
if (build.cc.streaming) { - lit_hist_len = max(msk.size(), min(s.length(), max_len)); + size_t lit_hist_len = + max(lit.msk.size(), min(lit.s.length(), max_len)); lit_hist_len = lit_hist_len ? lit_hist_len - 1 : 0; + DEBUG_PRINTF("lit requires %zu bytes of history\n", + lit_hist_len); + assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable); + mp.history_required = max(mp.history_required, lit_hist_len); } - DEBUG_PRINTF("lit requires %zu bytes of history\n", lit_hist_len); - assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable); - - auto lit_final = s; // copy - - if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) { - DEBUG_PRINTF("truncating to tail of length %zu\n", - size_t{ROSE_SHORT_LITERAL_LEN_MAX}); - lit_final.erase(0, lit_final.length() - - ROSE_SHORT_LITERAL_LEN_MAX); - // We shouldn't have set a threshold below 8 chars. - assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX); - assert(!noruns); - } - - const auto &s_final = lit_final.get_string(); - bool nocase = lit_final.any_nocase(); - - DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, " - "cmp=%s\n", f.fragment_id, - escapeString(s_final).c_str(), (int)nocase, noruns, - dumpMask(msk).c_str(), dumpMask(cmp).c_str()); - - if (!maskIsConsistent(s_final, nocase, msk, cmp)) { - DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); - continue; - } - - mp.accel_lits.emplace_back(s.get_string(), s.any_nocase(), msk, cmp, - info.group_mask); - mp.history_required = max(mp.history_required, lit_hist_len); - - u32 prog_offset = delay_rebuild ? f.delay_program_offset - : f.lit_program_offset; - const auto &groups = f.groups; - - mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset, - groups, msk, cmp); } } sort_and_unique(mp.lits); - - // Literals used for acceleration must be limited to max_len, as that's all - // we can see in history. 
- for_each(begin(mp.accel_lits), end(mp.accel_lits), - [&max_len](AccelString &a) { - trim_to_suffix(a.s, max_len); - trim_to_suffix(a.msk, max_len); - trim_to_suffix(a.cmp, max_len); - }); - sort_and_unique(mp.accel_lits); return mp; @@ -791,8 +873,8 @@ void MatcherProto::insert(const MatcherProto &a) { } static -void buildAccel(const RoseBuildImpl &build, const MatcherProto &mp, - HWLM &hwlm) { +void buildAccel(const RoseBuildImpl &build, + const vector &accel_lits, HWLM &hwlm) { if (!build.cc.grey.hamsterAccelForward) { return; } @@ -801,49 +883,68 @@ void buildAccel(const RoseBuildImpl &build, const MatcherProto &mp, return; } - buildForwardAccel(&hwlm, mp.accel_lits, build.getInitialGroups()); + buildForwardAccel(&hwlm, accel_lits, build.getInitialGroups()); } -bytecode_ptr buildFloatingMatcher(const RoseBuildImpl &build, - const vector &fragments, - size_t longLitLengthThreshold, - rose_group *fgroups, - size_t *historyRequired) { - *fgroups = 0; - - auto mp = makeMatcherProto(build, fragments, ROSE_FLOATING, false, - longLitLengthThreshold); - if (mp.lits.empty()) { - DEBUG_PRINTF("empty floating matcher\n"); +bytecode_ptr +buildHWLMMatcher(const RoseBuildImpl &build, LitProto *litProto) { + if (!litProto) { return nullptr; } - dumpMatcherLiterals(mp.lits, "floating", build.cc.grey); - - for (const hwlmLiteral &lit : mp.lits) { - *fgroups |= lit.groups; - } - - auto hwlm = hwlmBuild(mp.lits, false, build.cc, build.getInitialGroups()); + auto hwlm = hwlmBuild(*litProto->hwlmProto, build.cc, + build.getInitialGroups()); if (!hwlm) { throw CompileError("Unable to generate bytecode."); } - buildAccel(build, mp, *hwlm); + buildAccel(build, litProto->accel_lits, *hwlm); - if (build.cc.streaming) { - DEBUG_PRINTF("history_required=%zu\n", mp.history_required); - assert(mp.history_required <= build.cc.grey.maxHistoryAvailable); - *historyRequired = max(*historyRequired, mp.history_required); - } - - DEBUG_PRINTF("built floating literal table size %zu bytes\n", hwlm.size()); + DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", + hwlm.size()); return hwlm; } -bytecode_ptr -buildDelayRebuildMatcher(const RoseBuildImpl &build, - const vector &fragments, - size_t longLitLengthThreshold) { +unique_ptr +buildFloatingMatcherProto(const RoseBuildImpl &build, + const vector &fragments, + size_t longLitLengthThreshold, + rose_group *fgroups, + size_t *historyRequired) { + DEBUG_PRINTF("Floating literal matcher\n"); + *fgroups = 0; + + auto mp = makeMatcherProto(build, fragments, ROSE_FLOATING, false, + longLitLengthThreshold); + if (mp.lits.empty()) { + DEBUG_PRINTF("empty floating matcher\n"); + return nullptr; + } + dumpMatcherLiterals(mp.lits, "floating", build.cc.grey); + + for (const hwlmLiteral &lit : mp.lits) { + *fgroups |= lit.groups; + } + + if (build.cc.streaming) { + DEBUG_PRINTF("history_required=%zu\n", mp.history_required); + assert(mp.history_required <= build.cc.grey.maxHistoryAvailable); + *historyRequired = max(*historyRequired, mp.history_required); + } + + auto proto = hwlmBuildProto(mp.lits, false, build.cc); + + if (!proto) { + throw CompileError("Unable to generate literal matcher proto."); + } + + return ue2::make_unique(move(proto), mp.accel_lits); +} + +unique_ptr +buildDelayRebuildMatcherProto(const RoseBuildImpl &build, + const vector &fragments, + size_t longLitLengthThreshold) { + DEBUG_PRINTF("Delay literal matcher\n"); if (!build.cc.streaming) { DEBUG_PRINTF("not streaming\n"); return nullptr; @@ -857,20 +958,20 @@ buildDelayRebuildMatcher(const 
RoseBuildImpl &build, } dumpMatcherLiterals(mp.lits, "delay_rebuild", build.cc.grey); - auto hwlm = hwlmBuild(mp.lits, false, build.cc, build.getInitialGroups()); - if (!hwlm) { - throw CompileError("Unable to generate bytecode."); + + auto proto = hwlmBuildProto(mp.lits, false, build.cc); + + if (!proto) { + throw CompileError("Unable to generate literal matcher proto."); } - buildAccel(build, mp, *hwlm); - - DEBUG_PRINTF("built delay rebuild table size %zu bytes\n", hwlm.size()); - return hwlm; + return ue2::make_unique(move(proto), mp.accel_lits); } -bytecode_ptr -buildSmallBlockMatcher(const RoseBuildImpl &build, - const vector &fragments) { +unique_ptr +buildSmallBlockMatcherProto(const RoseBuildImpl &build, + const vector &fragments) { + DEBUG_PRINTF("Small block literal matcher\n"); if (build.cc.streaming) { DEBUG_PRINTF("streaming mode\n"); return nullptr; @@ -915,21 +1016,19 @@ buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto hwlm = hwlmBuild(mp.lits, true, build.cc, build.getInitialGroups()); - if (!hwlm) { - throw CompileError("Unable to generate bytecode."); + auto proto = hwlmBuildProto(mp.lits, false, build.cc); + + if (!proto) { + throw CompileError("Unable to generate literal matcher proto."); } - buildAccel(build, mp, *hwlm); - - DEBUG_PRINTF("built small block literal table size %zu bytes\n", - hwlm.size()); - return hwlm; + return ue2::make_unique(move(proto), mp.accel_lits); } -bytecode_ptr -buildEodAnchoredMatcher(const RoseBuildImpl &build, - const vector &fragments) { +unique_ptr +buildEodAnchoredMatcherProto(const RoseBuildImpl &build, + const vector &fragments) { + DEBUG_PRINTF("Eod anchored literal matcher\n"); auto mp = makeMatcherProto(build, fragments, ROSE_EOD_ANCHORED, false, build.ematcher_region_size); @@ -942,16 +1041,13 @@ buildEodAnchoredMatcher(const RoseBuildImpl &build, assert(build.ematcher_region_size); - auto hwlm = hwlmBuild(mp.lits, true, build.cc, build.getInitialGroups()); - if (!hwlm) { - throw CompileError("Unable to generate bytecode."); + auto proto = hwlmBuildProto(mp.lits, false, build.cc); + + if (!proto) { + throw CompileError("Unable to generate literal matcher proto."); } - buildAccel(build, mp, *hwlm); - - DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", - hwlm.size()); - return hwlm; + return ue2::make_unique(move(proto), mp.accel_lits); } } // namespace ue2 diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 2b1afc8c..ef8999ed 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -35,7 +35,10 @@ #define ROSE_BUILD_MATCHERS_H #include "rose_build_impl.h" +#include "rose_build_lit_accel.h" +#include "hwlm/hwlm_build.h" #include "util/bytecode_ptr.h" +#include "util/ue2string.h" #include @@ -44,38 +47,80 @@ struct HWLM; namespace ue2 { +static constexpr u32 INVALID_FRAG_ID = ~0U; + struct LitFragment { - LitFragment(u32 fragment_id_in, rose_group groups_in, u32 lit_id) - : fragment_id(fragment_id_in), groups(groups_in), lit_ids({lit_id}) {} - LitFragment(u32 fragment_id_in, rose_group groups_in, - std::vector lit_ids_in) - : fragment_id(fragment_id_in), groups(groups_in), - lit_ids(std::move(lit_ids_in)) {} + LitFragment(u32 fragment_id_in, ue2_literal s_in, + rose_group groups_in, u32 lit_id) + : fragment_id(fragment_id_in), s(s_in), groups(groups_in), + lit_ids({lit_id}) {} + LitFragment(u32 fragment_id_in, ue2_literal s_in, + rose_group groups_in, std::vector lit_ids_in) + : fragment_id(fragment_id_in), s(s_in), 
groups(groups_in), + lit_ids(std::move(lit_ids_in)) {} u32 fragment_id; + + /** + * \brief literal fragment. + */ + ue2_literal s; + + /** + * \brief FDR confirm squash mask for included literals. + */ + u8 squash = 0; + + /** + * \brief FDR confirm squash mask for included literals (Delayed + * literals only). + */ + u8 delay_squash = 0; + + /** + * \brief Fragment id of included literal. + */ + u32 included_frag_id = INVALID_FRAG_ID; + + /** + * \brief Fragment Id of included literal (Delayed literals only). + */ + u32 included_delay_frag_id = INVALID_FRAG_ID; rose_group groups; std::vector lit_ids; u32 lit_program_offset = ROSE_INVALID_PROG_OFFSET; u32 delay_program_offset = ROSE_INVALID_PROG_OFFSET; }; -bytecode_ptr -buildFloatingMatcher(const RoseBuildImpl &build, - const std::vector &fragments, - size_t longLitLengthThreshold, rose_group *fgroups, - size_t *historyRequired); +struct LitProto { + LitProto(std::unique_ptr hwlmProto_in, + std::vector &accel_lits_in) + : hwlmProto(std::move(hwlmProto_in)), accel_lits(accel_lits_in) {} + + std::unique_ptr hwlmProto; + std::vector accel_lits; +}; bytecode_ptr -buildDelayRebuildMatcher(const RoseBuildImpl &build, - const std::vector &fragments, - size_t longLitLengthThreshold); +buildHWLMMatcher(const RoseBuildImpl &build, LitProto *proto); -bytecode_ptr -buildSmallBlockMatcher(const RoseBuildImpl &build, - const std::vector &fragments); +std::unique_ptr +buildFloatingMatcherProto(const RoseBuildImpl &build, + const std::vector &fragments, + size_t longLitLengthThreshold, + rose_group *fgroups, + size_t *historyRequired); -bytecode_ptr -buildEodAnchoredMatcher(const RoseBuildImpl &build, - const std::vector &fragments); +std::unique_ptr +buildDelayRebuildMatcherProto(const RoseBuildImpl &build, + const std::vector &fragments, + size_t longLitLengthThreshold); +std::unique_ptr +buildSmallBlockMatcherProto(const RoseBuildImpl &build, + const std::vector &fragments); + +std::unique_ptr +buildEodAnchoredMatcherProto(const RoseBuildImpl &build, + const std::vector &fragments); void findMoreLiteralMasks(RoseBuildImpl &build); diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index d638e589..c0eba22b 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -63,9 +63,12 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" +#include "util/hash.h" +#include "util/insertion_ordered.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/ue2string.h" +#include "util/unordered.h" #include #include @@ -77,12 +80,11 @@ #include #include -#include #include using namespace std; using boost::adaptors::map_values; -using boost::hash_combine; +using boost::adaptors::map_keys; namespace ue2 { @@ -100,10 +102,6 @@ static const size_t DFA_CHUNK_SIZE_MAX = 200; /** \brief Max DFA states in a merged DFA. */ static const size_t DFA_MERGE_MAX_STATES = 8000; -/** \brief An LBR must have at least this many vertices to be protected from - * merging with other graphs. */ -static const size_t LARGE_LBR_MIN_VERTICES = 32; - /** \brief In block mode, merge two prefixes even if they don't have identical * literal sets if they have fewer than this many states and the merged graph * is also small. 
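Note: rose_build_merge.cpp now pulls in util/insertion_ordered.h; an insertion-ordered map iterates keys in first-insertion order rather than in hash order, which keeps passes such as get_eng_verts() further down deterministic from run to run. A minimal sketch of the idea (a vector of pairs plus a hash index); the real ue2::insertion_ordered_map is a full container and differs in detail:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    template <typename K, typename V>
    class InsertionOrderedMap {
    public:
        V &operator[](const K &key) {
            auto it = index_.find(key);
            if (it != index_.end()) {
                return data_[it->second].second; // existing key
            }
            index_.emplace(key, data_.size());   // remember insertion position
            data_.emplace_back(key, V());
            return data_.back().second;
        }
        typename std::vector<std::pair<K, V>>::const_iterator begin() const {
            return data_.begin();
        }
        typename std::vector<std::pair<K, V>>::const_iterator end() const {
            return data_.end();
        }

    private:
        std::vector<std::pair<K, V>> data_;      // iteration order
        std::unordered_map<K, size_t> index_;    // fast lookup
    };

    int main() {
        InsertionOrderedMap<std::string, int> m;
        m["gamma"] = 3;
        m["alpha"] = 1;
        m["beta"] = 2;
        for (const auto &kv : m) { // always gamma, alpha, beta
            printf("%s=%d\n", kv.first.c_str(), kv.second);
        }
        return 0;
    }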
*/ @@ -121,327 +119,22 @@ size_t small_rose_threshold(const CompileContext &cc) { : SMALL_ROSE_THRESHOLD_BLOCK; } -static -bool isLargeLBR(const NGHolder &g, const Grey &grey) { - if (num_vertices(g) < LARGE_LBR_MIN_VERTICES) { - return false; - } - return isLBR(g, grey); -} - -namespace { -struct DupeLeafKey { - explicit DupeLeafKey(const RoseVertexProps &litv) - : literals(litv.literals), reports(litv.reports), - eod_accept(litv.eod_accept), suffix(litv.suffix), left(litv.left), - som_adjust(litv.som_adjust) { - DEBUG_PRINTF("eod_accept %d\n", (int)eod_accept); - DEBUG_PRINTF("report %u\n", left.leftfix_report); - DEBUG_PRINTF("lag %u\n", left.lag); - } - - bool operator<(const DupeLeafKey &b) const { - const DupeLeafKey &a = *this; - ORDER_CHECK(literals); - ORDER_CHECK(eod_accept); - ORDER_CHECK(suffix); - ORDER_CHECK(reports); - ORDER_CHECK(som_adjust); - ORDER_CHECK(left.leftfix_report); - ORDER_CHECK(left.lag); - return false; - } - - flat_set literals; - flat_set reports; - bool eod_accept; - suffix_id suffix; - LeftEngInfo left; - u32 som_adjust; -}; - -struct UncalcLeafKey { - UncalcLeafKey(const RoseGraph &g, RoseVertex v) - : literals(g[v].literals), rose(g[v].left) { - for (const auto &e : in_edges_range(v, g)) { - RoseVertex u = source(e, g); - preds.insert(make_pair(u, g[e])); - } - } - - bool operator<(const UncalcLeafKey &b) const { - const UncalcLeafKey &a = *this; - ORDER_CHECK(literals); - ORDER_CHECK(preds); - ORDER_CHECK(rose); - return false; - } - - flat_set literals; - flat_set> preds; - LeftEngInfo rose; -}; -} // namespace - -/** - * This function merges leaf vertices with the same literals and report - * id/suffix. The leaf vertices of the graph are inspected and a mapping of - * leaf vertex properties to vertices is built. If the same set of leaf - * properties has already been seen when we inspect a vertex, we attempt to - * merge the vertex in with the previously seen vertex. This process can fail - * if the vertices share a common predecessor vertex but have a differing, - * incompatible relationship (different bounds or infix) with the predecessor. - * - * This takes place after \ref dedupeSuffixes to increase effectiveness as the - * same suffix is required for a merge to occur. - */ -void mergeDupeLeaves(RoseBuildImpl &tbi) { - map leaves; - vector changed; - - RoseGraph &g = tbi.g; - for (auto v : vertices_range(g)) { - if (in_degree(v, g) == 0) { - assert(tbi.isAnyStart(v)); - continue; - } - - DEBUG_PRINTF("inspecting vertex index=%zu in_degree %zu " - "out_degree %zu\n", g[v].index, in_degree(v, g), - out_degree(v, g)); - - // Vertex must be a reporting leaf node - if (g[v].reports.empty() || !isLeafNode(v, g)) { - continue; - } - - // At the moment, we ignore all successors of root or anchored_root, - // since many parts of our runtime assume that these have in-degree 1. 
- if (tbi.isRootSuccessor(v)) { - continue; - } - - DupeLeafKey dupe(g[v]); - if (leaves.find(dupe) == leaves.end()) { - leaves.insert(make_pair(dupe, v)); - continue; - } - - RoseVertex t = leaves.find(dupe)->second; - DEBUG_PRINTF("found two leaf dupe roles, index=%zu,%zu\n", g[v].index, - g[t].index); - - vector deadEdges; - for (const auto &e : in_edges_range(v, g)) { - RoseVertex u = source(e, g); - DEBUG_PRINTF("u index=%zu\n", g[u].index); - if (RoseEdge et = edge(u, t, g)) { - if (g[et].minBound <= g[e].minBound - && g[et].maxBound >= g[e].maxBound) { - DEBUG_PRINTF("remove more constrained edge\n"); - deadEdges.push_back(e); - } - } else { - DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].index, - g[t].index); - add_edge(u, t, g[e], g); - deadEdges.push_back(e); - } - } - - if (!deadEdges.empty()) { - for (auto &e : deadEdges) { - remove_edge(e, g); - } - changed.push_back(v); - g[t].min_offset = min(g[t].min_offset, g[v].min_offset); - g[t].max_offset = max(g[t].max_offset, g[v].max_offset); - } - } - DEBUG_PRINTF("find loop done\n"); - - // Remove any vertices that now have no in-edges. - size_t countRemovals = 0; - for (size_t i = 0; i < changed.size(); i++) { - RoseVertex v = changed[i]; - if (in_degree(v, g) == 0) { - DEBUG_PRINTF("remove vertex\n"); - if (!tbi.isVirtualVertex(v)) { - for (u32 lit_id : g[v].literals) { - tbi.literal_info[lit_id].vertices.erase(v); - } - } - remove_vertex(v, g); - countRemovals++; - } - } - - // if we've removed anything, we need to renumber vertices - if (countRemovals) { - renumber_vertices(g); - DEBUG_PRINTF("removed %zu vertices.\n", countRemovals); - } -} - -/** Merges the suffixes on the (identical) vertices in \a vcluster, used by - * \ref uncalcLeaves. */ -static -void mergeCluster(RoseGraph &g, const ReportManager &rm, - const vector &vcluster, - vector &dead, const CompileContext &cc) { - if (vcluster.size() <= 1) { - return; // No merge to perform. - } - - // Note that we batch merges up fairly crudely for performance reasons. - vector::const_iterator it = vcluster.begin(), it2; - while (it != vcluster.end()) { - vector cluster; - map rev; - - for (it2 = it; - it2 != vcluster.end() && cluster.size() < MERGE_GROUP_SIZE_MAX; - ++it2) { - RoseVertex v = *it2; - NGHolder *h = g[v].suffix.graph.get(); - assert(!g[v].suffix.haig); /* should not be here if haig */ - rev[h] = v; - cluster.push_back(h); - } - it = it2; - - DEBUG_PRINTF("merging cluster %zu\n", cluster.size()); - auto merged = mergeNfaCluster(cluster, &rm, cc); - DEBUG_PRINTF("done\n"); - - for (const auto &m : merged) { - NGHolder *h_victim = m.first; // mergee - NGHolder *h_winner = m.second; - RoseVertex victim = rev[h_victim]; - RoseVertex winner = rev[h_winner]; - - LIMIT_TO_AT_MOST(&g[winner].min_offset, g[victim].min_offset); - ENSURE_AT_LEAST(&g[winner].max_offset, g[victim].max_offset); - insert(&g[winner].reports, g[victim].reports); - - dead.push_back(victim); - } - } -} - -static -void findUncalcLeavesCandidates(RoseBuildImpl &tbi, - map > &clusters, - deque &ordered) { - const RoseGraph &g = tbi.g; - - vector suffix_vertices; // vertices with suffix graphs - ue2::unordered_map fcount; // ref count per graph - - for (auto v : vertices_range(g)) { - if (g[v].suffix) { - if (!g[v].suffix.graph) { - continue; /* cannot uncalc (haig/mcclellan); TODO */ - } - - assert(g[v].suffix.graph->kind == NFA_SUFFIX); - - // Ref count all suffixes, as we don't want to merge a suffix - // that happens to be shared with a non-leaf vertex somewhere. 
- DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].index, - g[v].suffix.graph.get()); - fcount[g[v].suffix.graph.get()]++; - - // Vertex must be a reporting pseudo accept - if (!isLeafNode(v, g)) { - continue; - } - - suffix_vertices.push_back(v); - } - } - - for (auto v : suffix_vertices) { - if (in_degree(v, g) == 0) { - assert(tbi.isAnyStart(v)); - continue; - } - - const NGHolder *h = g[v].suffix.graph.get(); - assert(h); - DEBUG_PRINTF("suffix %p\n", h); - - // We can't easily merge suffixes shared with other vertices, and - // creating a unique copy to do so may just mean we end up tracking - // more NFAs. Better to leave shared suffixes alone. - if (fcount[h] != 1) { - DEBUG_PRINTF("skipping shared suffix\n"); - continue; - } - - UncalcLeafKey key(g, v); - vector &vec = clusters[key]; - if (vec.empty()) { - - ordered.push_back(key); - } - vec.push_back(v); - } - - DEBUG_PRINTF("find loop done\n"); -} - -/** - * This function attempts to combine identical roles (same literals, same - * predecessors, etc) with different suffixes into a single role which - * activates a larger suffix. The leaf vertices of the graph with a suffix are - * grouped into clusters which have members triggered by identical roles. The - * \ref mergeNfaCluster function (from ng_uncalc_components) is then utilised - * to build a set of larger (and still implementable) suffixes. The graph is - * then updated to point to the new suffixes and any unneeded roles are - * removed. - * - * Note: suffixes which are shared amongst multiple roles are not considered - * for this pass as the individual suffixes would have to continue to exist for - * the other roles to trigger resulting in the transformation not producing any - * savings. - * - * Note: as \ref mergeNfaCluster is slow when the cluster sizes are large, - * clusters of more than \ref MERGE_GROUP_SIZE_MAX roles are split into smaller - * chunks for processing. - */ -void uncalcLeaves(RoseBuildImpl &tbi) { - DEBUG_PRINTF("uncalcing\n"); - - map > clusters; - deque ordered; - findUncalcLeavesCandidates(tbi, clusters, ordered); - - vector dead; - - for (const auto &key : ordered) { - DEBUG_PRINTF("cluster of size %zu\n", clusters[key].size()); - mergeCluster(tbi.g, tbi.rm, clusters[key], dead, tbi.cc); - } - tbi.removeVertices(dead); -} - /** * Returns a loose hash of a leftfix for use in dedupeLeftfixes. Note that * reports should not contribute to the hash. */ static -size_t hashLeftfix(const LeftEngInfo &left) { +size_t hashLeftfix(const left_id &left) { size_t val = 0; - if (left.castle) { - hash_combine(val, left.castle->reach()); - for (const auto &pr : left.castle->repeats) { + if (left.castle()) { + hash_combine(val, left.castle()->reach()); + for (const auto &pr : left.castle()->repeats) { hash_combine(val, pr.first); // top hash_combine(val, pr.second.bounds); } - } else if (left.graph) { - hash_combine(val, hash_holder(*left.graph)); + } else if (left.graph()) { + hash_combine(val, hash_holder(*left.graph())); } return val; @@ -487,33 +180,24 @@ private: }; /** - * Trivial Rose comparator intended to find graphs that are identical except - * for their report IDs. Relies on vertex and edge indices to pick up graphs - * that have been messily put together in different orderings... + * Intended to find graphs that are identical except for their report + * IDs. Relies on vertex and edge indices to pick up graphs that have been + * messily put together in different orderings. Only implemented for castles and + * holders. 
*/ -struct RoseComparator { - explicit RoseComparator(const RoseGraph &g_in) : g(g_in) {} - - bool operator()(const RoseVertex u, const RoseVertex v) const { - const LeftEngInfo &u_left = g[u].left; - const LeftEngInfo &v_left = g[v].left; - - if (u_left.castle && v_left.castle) { - return is_equal(*u_left.castle, u_left.leftfix_report, - *v_left.castle, v_left.leftfix_report); - } - - if (!u_left.graph || !v_left.graph) { - return false; - } - - return is_equal(*u_left.graph, u_left.leftfix_report, *v_left.graph, - v_left.leftfix_report); +static +bool is_equal(const left_id &u_left, ReportID u_report, + const left_id &v_left, ReportID v_report) { + if (u_left.castle() && v_left.castle()) { + return is_equal(*u_left.castle(), u_report, *v_left.castle(), v_report); } -private: - const RoseGraph &g; -}; + if (!u_left.graph() || !v_left.graph()) { + return false; + } + + return is_equal(*u_left.graph(), u_report, *v_left.graph(), v_report); +} } // namespace @@ -528,8 +212,8 @@ private: * * Note: only roles with a single predecessor vertex are considered for this * transform - it should probably be generalised to work for roles which share - * the same set of predecessor roles as for \ref dedupeLeftfixesVariableLag or it - * should be retired entirely. + * the same set of predecessor roles as for \ref dedupeLeftfixesVariableLag or + * it should be retired entirely. */ bool dedupeLeftfixes(RoseBuildImpl &tbi) { DEBUG_PRINTF("deduping leftfixes\n"); @@ -560,13 +244,11 @@ bool dedupeLeftfixes(RoseBuildImpl &tbi) { DEBUG_PRINTF("collected %zu rose groups\n", roses.size()); - const RoseComparator rosecmp(g); - // Walk groups and dedupe the roses therein. for (deque &verts : roses | map_values) { DEBUG_PRINTF("group has %zu vertices\n", verts.size()); - ue2::unordered_set seen; + unordered_set seen; for (auto jt = verts.begin(), jte = verts.end(); jt != jte; ++jt) { RoseVertex v = *jt; @@ -579,7 +261,9 @@ bool dedupeLeftfixes(RoseBuildImpl &tbi) { // Scan the rest of the list for dupes. for (auto kt = std::next(jt); kt != jte; ++kt) { - if (g[v].left == g[*kt].left || !rosecmp(v, *kt)) { + if (g[v].left == g[*kt].left + || !is_equal(g[v].left, g[v].left.leftfix_report, + g[*kt].left, g[*kt].left.leftfix_report)) { continue; } @@ -636,7 +320,7 @@ bool is_equal(const suffix_id &s1, const suffix_id &s2) { void dedupeSuffixes(RoseBuildImpl &tbi) { DEBUG_PRINTF("deduping suffixes\n"); - ue2::unordered_map> suffix_map; + unordered_map> suffix_map; map>, vector> part; // Collect suffixes into groups. @@ -703,7 +387,7 @@ template class Bouquet { private: list ordering; // Unique list in insert order. - typedef ue2::unordered_map > BouquetMap; + using BouquetMap = ue2_unordered_map>; BouquetMap bouquet; public: void insert(const EngineRef &h, RoseVertex v) { @@ -863,29 +547,27 @@ bool checkPrefix(const rose_literal_id &ul, const u32 ulag, static bool hasSameEngineType(const RoseVertexProps &u_prop, const RoseVertexProps &v_prop) { - const left_id u_left(u_prop.left), v_left(v_prop.left); + const left_id u_left = u_prop.left; + const left_id v_left = v_prop.left; - if (u_left.haig() || v_left.haig()) { - if (u_left.graph() != v_left.graph()) { - return false; - } - } - - if (u_left.dfa() || v_left.dfa()) { - if (u_left.graph() != v_left.graph()) { - return false; - } - } - - if (u_left.castle() || v_left.castle()) { - if (!u_left.castle() || !v_left.castle()) { - return false; // Must both be castles. 
- } - } - - return true; + return !u_left.haig() == !v_left.haig() + && !u_left.dfa() == !v_left.dfa() + && !u_left.castle() == !v_left.castle() + && !u_left.graph() == !v_left.graph(); } +/** + * Verifies that merging the leftfix of vertices does not cause conflicts due + * to the literals on the right. + * + * The main concern is that the lags of the literals and overlap between them + * allow the engine check offset to potentially regress. + * + * Parameters are vectors of literals + lag pairs. + * + * Note: if more constaints of when the leftfixes were going to be checked + * (mandatory lookarounds passing, offset checks), more merges may be allowed. + */ static bool compatibleLiteralsForMerge( const vector> &ulits, @@ -899,6 +581,21 @@ bool compatibleLiteralsForMerge( return false; } + // We don't handle delayed cases yet. + for (const auto &ue : ulits) { + const rose_literal_id &ul = *ue.first; + if (ul.delay) { + return false; + } + } + + for (const auto &ve : vlits) { + const rose_literal_id &vl = *ve.first; + if (vl.delay) { + return false; + } + } + /* An engine requires that all accesses to it are ordered by offsets. (ie, we can not check an engine's state at offset Y, if we have already checked its status at offset X and X > Y). If we can not establish that @@ -908,18 +605,10 @@ bool compatibleLiteralsForMerge( const rose_literal_id &ul = *ue.first; u32 ulag = ue.second; - if (ul.delay) { - return false; // We don't handle delayed cases yet. - } - for (const auto &ve : vlits) { const rose_literal_id &vl = *ve.first; u32 vlag = ve.second; - if (vl.delay) { - return false; // We don't handle delayed cases yet. - } - if (!checkPrefix(ul, ulag, vl, vlag) || !checkPrefix(vl, vlag, ul, ulag)) { DEBUG_PRINTF("prefix check failed\n"); @@ -944,8 +633,8 @@ bool isAccelerableLeftfix(const RoseBuildImpl &build, const NGHolder &g) { } /** - * In block mode, we want to be a little more selective, We will only merge - * prefix engines when the literal sets are the same, or if the merged graph + * In block mode, we want to be a little more selective -- We will only merge + * prefix engines when the literal sets are the same or if the merged graph * has only grown by a small amount. */ static @@ -1101,12 +790,13 @@ bool checkPredDelay(const rose_literal_id &ul, const rose_literal_id &vl, return true; } +template static never_inline -bool checkPredDelays(const RoseBuildImpl &tbi, const deque &v1, - const deque &v2) { +bool checkPredDelays(const RoseBuildImpl &build, const VertexCont &v1, + const VertexCont &v2) { flat_set preds; for (auto v : v1) { - insert(&preds, inv_adjacent_vertices(v, tbi.g)); + insert(&preds, inv_adjacent_vertices(v, build.g)); } flat_set pred_lits; @@ -1118,29 +808,29 @@ bool checkPredDelays(const RoseBuildImpl &tbi, const deque &v1, * the literal is no longer available. 
*/ flat_set known_good_preds; for (auto v : v2) { - insert(&known_good_preds, inv_adjacent_vertices(v, tbi.g)); + insert(&known_good_preds, inv_adjacent_vertices(v, build.g)); } for (auto u : preds) { if (!contains(known_good_preds, u)) { - insert(&pred_lits, tbi.g[u].literals); + insert(&pred_lits, build.g[u].literals); } } vector pred_rose_lits; pred_rose_lits.reserve(pred_lits.size()); for (const auto &p : pred_lits) { - pred_rose_lits.push_back(&tbi.literals.at(p)); + pred_rose_lits.push_back(&build.literals.at(p)); } for (auto v : v2) { - u32 vlag = tbi.g[v].left.lag; + u32 vlag = build.g[v].left.lag; if (!vlag) { continue; } - for (const u32 vlit : tbi.g[v].literals) { - const rose_literal_id &vl = tbi.literals.at(vlit); + for (const u32 vlit : build.g[v].literals) { + const rose_literal_id &vl = build.literals.at(vlit); assert(!vl.delay); // this should never have got this far? for (const auto &ul : pred_rose_lits) { assert(!ul->delay); // this should never have got this far? @@ -1189,7 +879,7 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, vector> ulits; /* lit + lag pairs */ for (auto a : verts1) { - if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u_front, a)) { + if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, v_front, a)) { return false; } @@ -1278,24 +968,35 @@ struct RoseMergeCandidate { } static -bool mergeRosePair(RoseBuildImpl &tbi, left_id &r1, left_id &r2, - const deque &verts1, - const deque &verts2) { +bool mergeLeftfixPair(RoseBuildImpl &build, left_id &r1, left_id &r2, + const vector &verts1, + const vector &verts2) { assert(!verts1.empty() && !verts2.empty()); - RoseGraph &g = tbi.g; + DEBUG_PRINTF("merging pair of leftfixes:\n"); + DEBUG_PRINTF(" A:%016zx: tops %s\n", r1.hash(), + as_string_list(all_tops(r1)).c_str()); + DEBUG_PRINTF(" B:%016zx: tops %s\n", r2.hash(), + as_string_list(all_tops(r2)).c_str()); + + RoseGraph &g = build.g; if (r1.graph()) { assert(r2.graph()); assert(r1.graph()->kind == r2.graph()->kind); - if (!mergeNfaPair(*r1.graph(), *r2.graph(), nullptr, tbi.cc)) { + if (!mergeNfaPair(*r1.graph(), *r2.graph(), nullptr, build.cc)) { DEBUG_PRINTF("nfa merge failed\n"); return false; } - // The graph in r1 has been merged into the graph in r2. Update r1's - // vertices with the new graph ptr. Since the parent vertices are the - // same, we know that tops will already have been distinct. + /* The graph in r1 has been merged into the graph in r2. Update r1's + * vertices with the new graph ptr. mergeNfaPair() does not alter the + * tops from the input graph so no need to update top values. + * + * It is the responsibility of the caller to ensure that the tops are + * distinct when they have different trigger conditions. + * [Note: mergeLeftfixesVariableLag() should have a common parent set] + */ shared_ptr &h = g[verts2.front()].left.graph; for (RoseVertex v : verts1) { g[v].left.graph = h; @@ -1304,7 +1005,7 @@ bool mergeRosePair(RoseBuildImpl &tbi, left_id &r1, left_id &r2, return true; } else if (r1.castle()) { assert(r2.castle()); - assert(tbi.cc.grey.allowCastle); + assert(build.cc.grey.allowCastle); map top_map; if (!mergeCastle(*r2.castle(), *r1.castle(), top_map)) { @@ -1328,59 +1029,195 @@ bool mergeRosePair(RoseBuildImpl &tbi, left_id &r1, left_id &r2, return false; } +/** + * Checks that there is no problem due to the involved vertices if we merge two + * leftfix engines. + * + * This functions takes the vertices on the right of the two engines. 
+ * + * Unlike mergeableRoseVertices(), this does not: + * - check that engines themselves can be merged + * - use heuristics to find out if merging the engines is wise. + */ static -void processMergeQueue(RoseBuildImpl &tbi, RoseBouquet &roses, - priority_queue &pq) { - ue2::unordered_set dead; +bool checkVerticesOkForLeftfixMerge(const RoseBuildImpl &build, + const vector &targets_1, + const vector &targets_2) { + assert(!targets_1.empty()); + assert(!targets_2.empty()); - DEBUG_PRINTF("merge queue has %zu entries\n", pq.size()); - - while (!pq.empty()) { - DEBUG_PRINTF("pq pop h1=%p, h2=%p, cpl=%u, states=%u\n", - pq.top().r1.graph(), pq.top().r2.graph(), pq.top().cpl, - pq.top().states); - - left_id r1 = pq.top().r1, r2 = pq.top().r2; - pq.pop(); - - if (contains(dead, r1) || contains(dead, r2)) { - continue; + vector> ulits; /* lit + lag pairs */ + for (auto a : targets_1) { + u32 ulag = build.g[a].left.lag; + for (u32 id : build.g[a].literals) { + ulits.emplace_back(&build.literals.at(id), ulag); } - - if (r1.graph() && r2.graph()) { - NGHolder *h1 = r1.graph(), *h2 = r2.graph(); - CharReach stop1 = findStopAlphabet(*h1, SOM_NONE); - CharReach stop2 = findStopAlphabet(*h2, SOM_NONE); - CharReach stopboth(stop1 & stop2); - DEBUG_PRINTF("stop1=%zu, stop2=%zu, stopboth=%zu\n", stop1.count(), - stop2.count(), stopboth.count()); - if (stopboth.count() < 10 && - (stop1.count() > 10 || stop2.count() > 10)) { - DEBUG_PRINTF("skip merge, would kill stop alphabet\n"); - continue; - } - size_t maxstop = max(stop1.count(), stop2.count()); - if (maxstop > 200 && stopboth.count() < 200) { - DEBUG_PRINTF("skip merge, would reduce stop alphabet\n"); - continue; - } - } - - const deque &verts1 = roses.vertices(r1); - const deque &verts2 = roses.vertices(r2); - - if (!mergeableRoseVertices(tbi, verts1, verts2)) { - continue; - } - - if (!mergeRosePair(tbi, r1, r2, verts1, verts2)) { - continue; - } - - roses.insert(r2, verts1); - roses.erase(r1); - dead.insert(r1); } + + vector> vlits; + for (auto a : targets_2) { + u32 vlag = build.g[a].left.lag; + for (u32 id : build.g[a].literals) { + vlits.emplace_back(&build.literals.at(id), vlag); + } + } + + if (!compatibleLiteralsForMerge(ulits, vlits)) { + return false; + } + + // Check preds are compatible as well. + if (!checkPredDelays(build, targets_1, targets_2) + || !checkPredDelays(build, targets_2, targets_1)) { + return false; + } + + DEBUG_PRINTF("vertex sets are mergeable\n"); + return true; +} + +/** + * In block mode, we want to be a little more selective -- we will only merge + * prefix engines when the literal sets are the same or if the merged graph + * has only grown by a small amount. + */ +static +bool goodBlockModeMerge(const RoseBuildImpl &build, + const vector &u_verts, const left_id &u_eng, + const vector &v_verts, + const left_id &v_eng) { + assert(!build.cc.streaming); + + // Always merge infixes if we can (subject to the other criteria in + // mergeableRoseVertices). + if (!build.isRootSuccessor(u_verts.front())) { + return true; + } + + const RoseGraph &g = build.g; + + flat_set u_lits; + for (RoseVertex u : u_verts) { + insert(&u_lits, g[u].literals); + } + + flat_set v_lits; + for (RoseVertex v : v_verts) { + insert(&v_lits, g[v].literals); + } + + // Merge prefixes with identical literal sets (as we'd have to run them + // both when we see those literals anyway). + if (u_lits == v_lits) { + return true; + } + + // The rest of this function only deals with the case when have graph + // leftfixes. 
+ if (!u_eng.graph()) { + return false; + } + assert(v_eng.graph()); + const NGHolder &ug = *u_eng.graph(); + const NGHolder &vg = *v_eng.graph(); + + size_t u_count = num_vertices(ug); + size_t v_count = num_vertices(vg); + DEBUG_PRINTF("u prefix has %zu vertices, v prefix has %zu vertices\n", + u_count, v_count); + if (u_count > MAX_BLOCK_PREFIX_MERGE_VERTICES || + v_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) { + DEBUG_PRINTF("prefixes too big already\n"); + return false; + } + + DEBUG_PRINTF("trying merge\n"); + NGHolder h; + cloneHolder(h, vg); + if (!mergeNfaPair(ug, h, nullptr, build.cc)) { + DEBUG_PRINTF("couldn't merge\n"); + return false; + } + + const size_t merged_count = num_vertices(h); + DEBUG_PRINTF("merged result has %zu vertices\n", merged_count); + if (merged_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) { + DEBUG_PRINTF("exceeded limit\n"); + return false; + } + + // We want to only perform merges that take advantage of some + // commonality in the two input graphs, so we check that the number of + // vertices has only grown a small amount: somewhere between the sum + // (no commonality) and the max (no growth at all) of the vertex counts + // of the input graphs. + size_t max_size = u_count + v_count; + size_t min_size = max(u_count, v_count); + size_t max_growth = ((max_size - min_size) * 25) / 100; + if (merged_count > min_size + max_growth) { + DEBUG_PRINTF("grew too much\n"); + return false; + } + + // We don't want to squander any chances at accelerating. + if (!isAccelerableLeftfix(build, h) + && (isAccelerableLeftfix(build, ug) + || isAccelerableLeftfix(build, vg))) { + DEBUG_PRINTF("would lose accel property\n"); + return false; + } + + DEBUG_PRINTF("safe to merge\n"); + return true; +} + +/** + * Merge r1 into r2 if safe and appropriate. Returns true on success. + */ +static +bool mergeLeftVL_tryMergeCandidate(RoseBuildImpl &build, left_id &r1, + const vector &targets_1, + left_id &r2, + const vector &targets_2) { + if (targets_1.empty() || targets_2.empty()) { + /* one of the engines has already been merged away */ + return false; + } + + assert(!r1.graph() == !r2.graph()); + if (r1.graph()) { + NGHolder *h1 = r1.graph(); + NGHolder *h2 = r2.graph(); + CharReach stop1 = findStopAlphabet(*h1, SOM_NONE); + CharReach stop2 = findStopAlphabet(*h2, SOM_NONE); + CharReach stopboth = stop1 & stop2; + DEBUG_PRINTF("stop1=%zu, stop2=%zu, stopboth=%zu\n", stop1.count(), + stop2.count(), stopboth.count()); + if (stopboth.count() < 10 + && (stop1.count() > 10 || stop2.count() > 10)) { + DEBUG_PRINTF("skip merge, would kill stop alphabet\n"); + return false; + } + size_t maxstop = max(stop1.count(), stop2.count()); + if (maxstop > 200 && stopboth.count() < 200) { + DEBUG_PRINTF("skip merge, would reduce stop alphabet\n"); + return false; + } + } + + /* Rechecking that the targets are compatible, as we may have already + * merged new states into r1 or r2 and we need to verify that this + * candidate is still ok. 
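/* Editorial sketch, not part of the patch: the block-mode growth bound used
 * in goodBlockModeMerge() above, restated as a standalone predicate (the
 * helper name is an assumption). Worked example: u_count = 40, v_count = 50
 * gives max_size = 90, min_size = 50, max_growth = (90 - 50) * 25 / 100 = 10,
 * so a trial merge producing more than 60 vertices is rejected as showing too
 * little commonality between the two prefixes. */
static
bool mergedSizeWithinGrowthBound(size_t u_count, size_t v_count,
                                 size_t merged_count) {
    size_t max_size = u_count + v_count;          // no commonality at all
    size_t min_size = std::max(u_count, v_count); // no growth at all
    size_t max_growth = ((max_size - min_size) * 25) / 100;
    return merged_count <= min_size + max_growth;
}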
*/ + if (!checkVerticesOkForLeftfixMerge(build, targets_1, targets_2)) { + return false; + } + + if (!build.cc.streaming + && !goodBlockModeMerge(build, targets_1, r1, targets_2, r2)) { + return false; + } + + return mergeLeftfixPair(build, r1, r2, targets_1, targets_2); } static @@ -1405,30 +1242,6 @@ bool nfaHasFiniteMaxWidth(const NGHolder &g) { return findMaxWidth(g).is_finite(); } -namespace { -struct RoseMergeKey { - RoseMergeKey(const set &parents_in, - bool narrowStart_in, bool hasMaxWidth_in) : - narrowStart(narrowStart_in), - hasMaxWidth(hasMaxWidth_in), - parents(parents_in) {} - bool operator<(const RoseMergeKey &b) const { - const RoseMergeKey &a = *this; - ORDER_CHECK(narrowStart); - ORDER_CHECK(hasMaxWidth); - ORDER_CHECK(parents); - return false; - } - - // NOTE: these two bool discriminators are only used for prefixes, not - // infixes. - bool narrowStart; - bool hasMaxWidth; - - set parents; -}; -} - static bool hasReformedStartDotStar(const NGHolder &h, const Grey &grey) { if (!proper_out_degree(h.startDs, h)) { @@ -1461,6 +1274,84 @@ u32 commonPrefixLength(left_id &r1, left_id &r2) { return 0; } +namespace { +struct MergeKey { + MergeKey(const left_id &left, flat_set parents_in) : + parents(std::move(parents_in)) { + + // We want to distinguish prefixes (but not infixes) on whether they + // have a narrow start or max width. + if (left.graph() && !is_triggered(*left.graph())) { + const NGHolder &h = *left.graph(); + narrowStart = nfaHasNarrowStart(h); + hasMaxWidth = nfaHasFiniteMaxWidth(h); + } else { + narrowStart = false; + hasMaxWidth = false; + } + + if (left.castle()) { + /* castles should have a non-empty reach */ + assert(left.castle()->reach().any()); + castle_cr = left.castle()->reach(); + } else { + assert(left.graph()); + } + } + + bool operator<(const MergeKey &b) const { + const MergeKey &a = *this; + ORDER_CHECK(narrowStart); + ORDER_CHECK(hasMaxWidth); + ORDER_CHECK(castle_cr); + ORDER_CHECK(parents); + return false; + } + + // NOTE: these two bool discriminators are only used for prefixes, not + // infixes. + bool narrowStart; + bool hasMaxWidth; + CharReach castle_cr; /* empty for graphs, reach (non-empty) for castles. */ + + flat_set parents; +}; +} + +template +static +void chunk(vector in, vector> *out, size_t chunk_size) { + if (in.size() <= chunk_size) { + out->push_back(std::move(in)); + return; + } + + out->push_back(vector()); + out->back().reserve(chunk_size); + for (const auto &t : in) { + if (out->back().size() >= chunk_size) { + out->push_back(vector()); + out->back().reserve(chunk_size); + } + out->back().push_back(std::move(t)); + } +} + +static +insertion_ordered_map> get_eng_verts(RoseGraph &g) { + insertion_ordered_map> eng_verts; + for (auto v : vertices_range(g)) { + const auto &left = g[v].left; + if (!left) { + continue; + } + assert(contains(all_reports(left), left.leftfix_report)); + eng_verts[left].push_back(v); + } + + return eng_verts; +} + /** * This pass attempts to merge prefix/infix engines which share a common set of * parent vertices. @@ -1472,7 +1363,9 @@ u32 commonPrefixLength(left_id &r1, left_id &r2) { * the stop alphabet. * * Infixes: - * - LBR candidates are not considered. + * - It is expected that when this is run all infixes are still at the single + * top stage as we have not yet merged unrelated infixes together. After + * execution, castles may have multiple (but equivalent) tops. * * Prefixes: * - transient prefixes are not considered. 
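/* Editorial sketch, not part of the patch: how the chunk() helper above
 * splits a candidate group. The element type and sizes are arbitrary. */
std::vector<int> raw_group = {1, 2, 3, 4, 5};
std::vector<std::vector<int>> chunks;
chunk(std::move(raw_group), &chunks, 2 /* chunk_size */);
// chunks now holds {1, 2}, {3, 4}, {5}: three groups, each processed
// independently so the all-pairs merge queue stays small on large inputs.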
@@ -1482,136 +1375,132 @@ u32 commonPrefixLength(left_id &r1, left_id &r2) { * - merges are not considered in cases where dot star start state will be * reformed to optimise a leading repeat. */ -void mergeLeftfixesVariableLag(RoseBuildImpl &tbi) { - if (!tbi.cc.grey.mergeRose) { +void mergeLeftfixesVariableLag(RoseBuildImpl &build) { + if (!build.cc.grey.mergeRose) { return; } + assert(!hasOrphanedTops(build)); - map rosesByParent; - RoseGraph &g = tbi.g; - set parents; + RoseGraph &g = build.g; DEBUG_PRINTF("-----\n"); DEBUG_PRINTF("entry\n"); DEBUG_PRINTF("-----\n"); - for (auto v : vertices_range(g)) { - if (!g[v].left) { - continue; - } - - const bool is_prefix = tbi.isRootSuccessor(v); + auto eng_verts = get_eng_verts(g); + map> engine_groups; + for (const auto &e : eng_verts) { + const left_id &left = e.first; + const auto &verts = e.second; // Only non-transient for the moment. - if (contains(tbi.transient, g[v].left)) { + if (contains(build.transient, left)) { continue; } // No forced McClellan or Haig infix merges. - if (g[v].left.dfa || (!is_prefix && g[v].left.haig)) { + if (left.dfa() || left.haig()) { continue; } + assert(left.graph() || left.castle()); - if (g[v].left.graph) { - NGHolder &h = *g[v].left.graph; + if (left.graph()) { + const NGHolder &h = *left.graph(); + /* we should not have merged yet */ + assert(!is_triggered(h) || onlyOneTop(h)); - /* Ensure that kind on the graph is correct */ - assert(h.kind == (is_prefix ? NFA_PREFIX : NFA_INFIX)); - - if (hasReformedStartDotStar(h, tbi.cc.grey)) { + if (hasReformedStartDotStar(h, build.cc.grey)) { continue; // preserve the optimisation of the leading repeat } + } else { + assert(left.castle()); - if (!is_prefix && isLargeLBR(h, tbi.cc.grey)) { + if (!build.cc.grey.allowCastle) { + DEBUG_PRINTF("castle merging disallowed by greybox\n"); continue; } } - if (g[v].left.castle && !tbi.cc.grey.allowCastle) { - DEBUG_PRINTF("castle merging disallowed by greybox\n"); - continue; - } - // We collapse the anchored root into the root vertex when calculating // parents, so that we can merge differently-anchored prefix roses // together. (Prompted by UE-2100) - parents.clear(); - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (tbi.isAnyStart(u)) { - parents.insert(tbi.root); - } else { - parents.insert(u); - } + + flat_set parents; + for (RoseVertex v : verts) { + insert(&parents, inv_adjacent_vertices_range(v, g)); } - if (parents.empty()) { - assert(0); - continue; + if (contains(parents, build.anchored_root)) { + parents.erase(build.anchored_root); + parents.insert(build.root); } - // We want to distinguish prefixes (but not infixes) on whether they - // have a narrow start or max width. - bool narrowStart = false, hasMaxWidth = false; - if (is_prefix && g[v].left.graph) { - const NGHolder &h = *g[v].left.graph; - narrowStart = nfaHasNarrowStart(h); - hasMaxWidth = nfaHasFiniteMaxWidth(h); - } + assert(!parents.empty()); - RoseMergeKey key(parents, narrowStart, hasMaxWidth); - rosesByParent[key].insert(g[v].left, v); + engine_groups[MergeKey(left, parents)].push_back(left); } - for (auto &m : rosesByParent) { - if (m.second.size() < 2) { + vector> chunks; + for (auto &raw_group : engine_groups | map_values) { + chunk(move(raw_group), &chunks, MERGE_GROUP_SIZE_MAX); + } + engine_groups.clear(); + + DEBUG_PRINTF("chunked roses into %zu groups\n", chunks.size()); + + for (auto &roses : chunks) { + if (roses.size() < 2) { continue; } + // All pairs on the prio queue. 
+ u32 tie_breaker = 0; + priority_queue pq; + for (auto it = roses.begin(), ite = roses.end(); it != ite; ++it) { + left_id r1 = *it; + const vector &targets_1 = eng_verts[r1]; - deque rose_groups; - chunkBouquets(m.second, rose_groups, MERGE_GROUP_SIZE_MAX); - m.second.clear(); - DEBUG_PRINTF("chunked roses into %zu groups\n", rose_groups.size()); + for (auto jt = next(it); jt != ite; ++jt) { + left_id r2 = *jt; - for (auto &roses : rose_groups) { - // All pairs on the prio queue. - u32 tie_breaker = 0; - priority_queue pq; - for (auto it = roses.begin(), ite = roses.end(); it != ite; ++it) { - left_id r1 = *it; - const deque &verts1 = roses.vertices(r1); + /* we should have already split on engine types and reach */ + assert(!r1.castle() == !r2.castle()); + assert(!r1.graph() == !r2.graph()); + assert(!r1.castle() + || r1.castle()->reach() == r2.castle()->reach()); - for (auto jt = next(it); jt != ite; ++jt) { - left_id r2 = *jt; - - // Roses must be of the same engine type to be mergeable. - if ((!r1.graph() != !r2.graph()) || - (!r1.castle() != !r2.castle())) { - continue; - } - - // Castles must have the same reach to be mergeable. - if (r1.castle()) { - if (r1.castle()->reach() != r2.castle()->reach()) { - continue; - } - } - - const deque &verts2 = roses.vertices(r2); - if (!mergeableRoseVertices(tbi, verts1, verts2)) { - continue; // No point queueing unmergeable cases. - } - - u32 cpl = commonPrefixLength(r1, r2); - pq.push(RoseMergeCandidate(r1, r2, cpl, tie_breaker++)); + const vector &targets_2 = eng_verts[r2]; + if (!checkVerticesOkForLeftfixMerge(build, targets_1, + targets_2)) { + continue; // No point queueing unmergeable cases. } + + u32 cpl = commonPrefixLength(r1, r2); + pq.push(RoseMergeCandidate(r1, r2, cpl, tie_breaker++)); + } + } + + DEBUG_PRINTF("merge queue has %zu entries\n", pq.size()); + + while (!pq.empty()) { + left_id r1 = pq.top().r1; + left_id r2 = pq.top().r2; + DEBUG_PRINTF("pq pop h1=%p, h2=%p, cpl=%u, states=%u\n", + r1.graph(), r2.graph(), pq.top().cpl, pq.top().states); + pq.pop(); + vector &targets_1 = eng_verts[r1]; + vector &targets_2 = eng_verts[r2]; + if (mergeLeftVL_tryMergeCandidate(build, r1, targets_1, r2, + targets_2)) { + insert(&targets_2, targets_2.end(), targets_1); + targets_1.clear(); } - processMergeQueue(tbi, roses, pq); } } DEBUG_PRINTF("-----\n"); DEBUG_PRINTF("exit\n"); DEBUG_PRINTF("-----\n"); + assert(!hasOrphanedTops(build)); } namespace { @@ -1620,16 +1509,15 @@ namespace { * Key used to group sets of leftfixes for the dedupeLeftfixesVariableLag path. */ struct DedupeLeftKey { - DedupeLeftKey(const RoseBuildImpl &build, RoseVertex v) - : left_hash(hashLeftfix(build.g[v].left)) { - const auto &g = build.g; - for (const auto &e : in_edges_range(v, g)) { - preds.emplace(g[source(e, g)].index, g[e].rose_top); - } + DedupeLeftKey(const RoseBuildImpl &build, + flat_set> preds_in, const left_id &left) + : left_hash(hashLeftfix(left)), preds(move(preds_in)), + transient(contains(build.transient, left)) { } bool operator<(const DedupeLeftKey &b) const { - return tie(left_hash, preds) < tie(b.left_hash, b.preds); + return tie(left_hash, preds, transient) + < tie(b.left_hash, b.preds, b.transient); } private: @@ -1638,11 +1526,23 @@ private: size_t left_hash; /** For each in-edge, the pair of (parent index, edge top). */ - set> preds; + flat_set> preds; + + /** We don't want to combine transient with non-transient. 
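/* Editorial sketch, not part of the patch: the shape of the ordering used by
 * the merge queue above. RoseMergeCandidate's real comparator is not shown in
 * this hunk, so the struct below is an assumed simplification: pairs with the
 * longest common prefix are popped first, and the monotonically increasing
 * tie_breaker keeps the order deterministic when scores are equal. */
struct MergeCandidateSketch {
    u32 cpl;         // common prefix length of the two engines
    u32 tie_breaker; // insertion order, for reproducible builds
    bool operator<(const MergeCandidateSketch &b) const {
        // std::priority_queue pops the *largest* element first.
        if (cpl != b.cpl) {
            return cpl < b.cpl;
        }
        return tie_breaker > b.tie_breaker; // earlier insertion wins ties
    }
};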
*/ + bool transient; }; } // namespace +static +flat_set> get_pred_tops(RoseVertex v, const RoseGraph &g) { + flat_set> preds; + for (const auto &e : in_edges_range(v, g)) { + preds.emplace(g[source(e, g)].index, g[e].rose_top); + } + return preds; +} + /** * This is a generalisation of \ref dedupeLeftfixes which relaxes two * restrictions: multiple predecessor roles are allowed and the delay used by @@ -1660,77 +1560,99 @@ private: * successor may want to inspect it; the overlap relationships between the * involved literals are examined to ensure that this property holds. * - * Note: in block mode we restrict the dedupe of prefixes further as some of - * logic checks are shared with the mergeLeftfix functions. + * Note: this is unable to dedupe when delayed literals are involved unlike + * dedupeLeftfixes. */ -void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi) { - map roseGrouping; - +void dedupeLeftfixesVariableLag(RoseBuildImpl &build) { DEBUG_PRINTF("entry\n"); - RoseGraph &g = tbi.g; - for (auto v : vertices_range(g)) { - if (!g[v].left) { + RoseGraph &g = build.g; + auto eng_verts = get_eng_verts(g); + + map> engine_groups; + for (const auto &e : eng_verts) { + const left_id &left = e.first; + const auto &verts = e.second; + + /* There should only be one report on an engine as no merges have + * happened yet. (aside from eod prefixes) */ + if (all_reports(left).size() != 1) { + assert(any_of_in(adjacent_vertices_range(verts.front(), g), + [&](RoseVertex w) { return g[w].eod_accept; })); continue; } - const left_id leftfix(g[v].left); - - // Only non-transient for the moment. - if (contains(tbi.transient, leftfix)) { + if (left.haig()) { + /* TODO: allow deduping of identical haigs */ continue; } - if (leftfix.haig()) { - /* TODO: allow merging of identical haigs */ - continue; + if (left.graph()) { + /* we should not have merged yet */ + assert(!is_triggered(*left.graph()) || onlyOneTop(*left.graph())); } - roseGrouping[DedupeLeftKey(tbi, v)].insert(leftfix, v); + auto preds = get_pred_tops(verts.front(), g); + for (RoseVertex v : verts) { + if (preds != get_pred_tops(v, g)) { + DEBUG_PRINTF("distinct pred sets\n"); + continue; + } + } + engine_groups[DedupeLeftKey(build, move(preds), left)].push_back(left); } - for (RoseBouquet &roses : roseGrouping | map_values) { - DEBUG_PRINTF("group of %zu roses\n", roses.size()); + /* We don't bother chunking as we expect deduping to be successful if the + * hashes match */ - if (roses.size() < 2) { + for (auto &group : engine_groups | map_values) { + DEBUG_PRINTF("group of %zu roses\n", group.size()); + + if (group.size() < 2) { continue; } - const RoseComparator rosecmp(g); - - for (auto it = roses.begin(); it != roses.end(); ++it) { + for (auto it = group.begin(); it != group.end(); ++it) { left_id r1 = *it; - const deque &verts1 = roses.vertices(r1); + vector &verts1 = eng_verts[r1]; + assert(!verts1.empty()); /* cleared engines should be behind us */ - for (auto jt = next(it); jt != roses.end(); ++jt) { + assert(all_reports(r1).size() == 1); + ReportID r1_report = *all_reports(r1).begin(); + + for (auto jt = next(it); jt != group.end(); ++jt) { left_id r2 = *jt; - const deque &verts2 = roses.vertices(r2); + vector &verts2 = eng_verts[r2]; + assert(!verts2.empty()); + assert(all_reports(r2).size() == 1); + ReportID r2_report = *all_reports(r2).begin(); - if (!rosecmp(verts1.front(), verts2.front())) { + if (!is_equal(r1, r1_report, r2, r2_report)) { continue; } - if (!mergeableRoseVertices(tbi, verts1, verts2)) { + if 
(!checkVerticesOkForLeftfixMerge(build, verts1, verts2)) { continue; } DEBUG_PRINTF("%p and %p are dupes\n", r1.graph(), r2.graph()); - // Replace h1 with h2. - - const LeftEngInfo &v2_left = g[verts2.front()].left; - assert(v2_left.graph.get() == r2.graph()); + // Replace r1 with r2. for (auto v : verts1) { DEBUG_PRINTF("replacing report %u with %u on %zu\n", - g[v].left.leftfix_report, - v2_left.leftfix_report, g[v].index); + r2_report, r1_report, g[v].index); u32 orig_lag = g[v].left.lag; - g[v].left = v2_left; + g[v].left = g[verts2.front()].left; g[v].left.lag = orig_lag; } - roses.insert(r2, verts1); - // no need to erase h1 from roses, that would invalidate `it'. + + insert(&verts2, verts2.end(), verts1); + verts1.clear(); + + /* remove stale entry from transient set, if present */ + build.transient.erase(r1); + break; } } @@ -1738,7 +1660,7 @@ void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi) { } static -u32 findUnusedTop(const ue2::flat_set &tops) { +u32 findUnusedTop(const flat_set &tops) { u32 i = 0; while (contains(tops, i)) { i++; @@ -1766,7 +1688,7 @@ void replaceTops(NGHolder &h, const map &top_mapping) { static bool setDistinctTops(NGHolder &h1, const NGHolder &h2, map &top_mapping) { - ue2::flat_set tops1 = getTops(h1), tops2 = getTops(h2); + flat_set tops1 = getTops(h1), tops2 = getTops(h2); DEBUG_PRINTF("before: h1 has %zu tops, h2 has %zu tops\n", tops1.size(), tops2.size()); @@ -1862,7 +1784,7 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) { // We track the number of accelerable states for each graph in a map and // only recompute them when the graph is modified. - ue2::unordered_map accel_count; + unordered_map accel_count; for (const auto &rose : roses) { assert(rose.graph()->kind == NFA_INFIX); accel_count[rose] = estimatedAccelStates(tbi, *rose.graph()); @@ -1954,66 +1876,6 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) { } } -static -void mergeCastleChunk(RoseBuildImpl &tbi, RoseBouquet &cands) { - /* caller must have already ensured that candidates have the same reach */ - RoseGraph &g = tbi.g; - DEBUG_PRINTF("%zu castle rose merge candidates\n", cands.size()); - - deque merged; - - for (auto it = cands.begin(); it != cands.end(); ++it) { - left_id r1 = *it; - CastleProto &castle1 = *r1.castle(); - const deque &verts1 = cands.vertices(r1); - - merged.clear(); - - for (auto jt = next(it); jt != cands.end(); ++jt) { - left_id r2 = *jt; - CastleProto &castle2 = *r2.castle(); - const deque &verts2 = cands.vertices(r2); - - if (castle1.repeats.size() == castle1.max_occupancy) { - DEBUG_PRINTF("castle1 has hit max occupancy\n"); - break; // next castle1 - } - - assert(castle1.reach() == castle2.reach()); - - if (!mergeableRoseVertices(tbi, verts1, verts2)) { - DEBUG_PRINTF("not mergeable\n"); - continue; // next castle2 - } - - DEBUG_PRINTF("castle1=%p (size %zu), castle2=%p (size %zu)\n", - &castle1, castle1.repeats.size(), &castle2, - castle2.repeats.size()); - - map top_map; - if (!mergeCastle(castle1, castle2, top_map)) { - DEBUG_PRINTF("couldn't merge\n"); - continue; // next castle2 - } - - // Update castle2's roses to point to castle1 now. 
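/* Editorial sketch, not part of the patch: one way to build the remapping
 * that setDistinctTops()/replaceTops() above apply, so that the tops of the
 * graph being merged in no longer collide with the tops already in use. The
 * helper name is an assumption. */
static
std::map<u32, u32> buildTopRemap(const flat_set<u32> &tops1,
                                 flat_set<u32> tops2) {
    std::map<u32, u32> remap;
    for (u32 t : tops1) {
        if (!contains(tops2, t)) {
            tops2.insert(t); // already distinct; reserve it
            continue;
        }
        u32 fresh = findUnusedTop(tops2); // smallest value not yet taken
        remap[t] = fresh;
        tops2.insert(fresh);
    }
    return remap;
}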
- shared_ptr winner = g[verts1.front()].left.castle; - for (auto v : verts2) { - g[v].left.castle = winner; - for (const auto &e : in_edges_range(v, g)) { - g[e].rose_top = top_map.at(g[e].rose_top); - } - } - - cands.insert(r1, verts2); - merged.push_back(r2); - } - - DEBUG_PRINTF("%zu roses merged\n", merged.size()); - cands.erase_all(merged.begin(), merged.end()); - } -} - /** * This pass attempts to merge prefix/infix engines with a small number of * vertices together into larger engines. The engines must not be have a @@ -2072,11 +1934,6 @@ void mergeSmallLeftfixes(RoseBuildImpl &tbi) { continue; } - // Don't merge cases that will become LBRs or haigs. - if (isLargeLBR(h, tbi.cc.grey)) { - continue; - } - // Small roses only. if (num_vertices(h) > small_rose_threshold(tbi.cc)) { continue; @@ -2095,55 +1952,110 @@ void mergeSmallLeftfixes(RoseBuildImpl &tbi) { } } -void mergeCastleLeftfixes(RoseBuildImpl &tbi) { + +static +void mergeCastleChunk(RoseBuildImpl &build, vector &cands, + insertion_ordered_map> &eng_verts) { + /* caller must have already ensured that candidates have the same reach */ + RoseGraph &g = build.g; + DEBUG_PRINTF("%zu castle leftfix merge candidates\n", cands.size()); + + for (auto it = cands.begin(); it != cands.end(); ++it) { + left_id &cand_1 = *it; + vector &verts_1 = eng_verts[cand_1]; + if (verts_1.empty()) { + continue; + } + + for (auto jt = next(it); jt != cands.end(); ++jt) { + const left_id &cand_2 = *jt; + vector &verts_2 = eng_verts[cand_2]; + if (verts_2.empty()) { + continue; + } + + assert(cand_1.castle()->reach() == cand_2.castle()->reach()); + + if (!checkVerticesOkForLeftfixMerge(build, verts_1, verts_2)) { + DEBUG_PRINTF("not mergeable\n"); + continue; // next cand_2 + } + + DEBUG_PRINTF("castle1=%p (size %zu)\n", cand_1.castle(), + cand_1.castle()->repeats.size()); + DEBUG_PRINTF("castle2=%p (size %zu)\n", cand_2.castle(), + cand_2.castle()->repeats.size()); + + map top_map; + if (!mergeCastle(*cand_1.castle(), *cand_2.castle(), top_map)) { + DEBUG_PRINTF("couldn't merge\n"); + continue; // next cand_2 + } + + // Update castle2's roses to point to castle1 now. + shared_ptr winner = g[verts_1.front()].left.castle; + for (auto v : verts_2) { + assert(g[v].left.castle.get() == cand_2.castle()); + g[v].left.castle = winner; + for (const auto &e : in_edges_range(v, g)) { + g[e].rose_top = top_map.at(g[e].rose_top); + } + } + + insert(&verts_1, verts_1.end(), verts_2); + verts_2.clear(); + } + } +} + +/** + * Merges castles with the same reach together regardless of where in the rose + * graph they are. Note: there is no requirement for the castles to have common + * parent or target vertices. + * + * There are no heuristics for reducing block mode merges as castle speed + * mainly depends on the reach being scanned. + */ +void mergeCastleLeftfixes(RoseBuildImpl &build) { DEBUG_PRINTF("entry\n"); - if (!tbi.cc.grey.mergeRose || !tbi.cc.grey.roseMultiTopRoses || - !tbi.cc.grey.allowCastle) { + if (!build.cc.grey.mergeRose || !build.cc.grey.roseMultiTopRoses + || !build.cc.grey.allowCastle) { return; } - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; - map by_reach; + insertion_ordered_map> eng_verts; for (auto v : vertices_range(g)) { - if (!g[v].left) { + if (!g[v].left.castle) { continue; } - // Handle single-parent infixes only. - if (tbi.isRootSuccessor(v)) { + // Handle infixes only. + if (build.isRootSuccessor(v)) { continue; } - const left_id left(g[v].left); - - // Only non-transient for the moment. 
- if (contains(tbi.transient, left)) { - continue; - } - - if (!left.castle()) { - continue; - } - - const CastleProto &castle = *left.castle(); - const CharReach &cr = castle.reach(); - by_reach[cr].insert(left, v); + eng_verts[g[v].left].push_back(v); } - for (auto &m : by_reach) { - DEBUG_PRINTF("%zu castles for reach: %s\n", m.second.size(), - describeClass(m.first).c_str()); - RoseBouquet &candidates = m.second; - deque cand_groups; - chunkBouquets(candidates, cand_groups, MERGE_CASTLE_GROUP_SIZE_MAX); - candidates.clear(); + map> by_reach; + for (const auto &left : eng_verts | map_keys) { + by_reach[left.castle()->reach()].push_back(left); + } - for (auto &group : cand_groups) { - mergeCastleChunk(tbi, group); - } + vector> chunks; + for (auto &raw_group : by_reach | map_values) { + chunk(move(raw_group), &chunks, MERGE_CASTLE_GROUP_SIZE_MAX); + } + by_reach.clear(); + + DEBUG_PRINTF("chunked castles into %zu groups\n", chunks.size()); + + for (auto &chunk : chunks) { + mergeCastleChunk(build, chunk, eng_verts); } } @@ -2157,7 +2069,7 @@ void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes, // If this isn't an acyclic case, we track the number of accelerable states // for each graph in a map and only recompute them when the graph is // modified. - ue2::unordered_map accel_count; + unordered_map accel_count; if (!acyclic) { for (const auto &suffix : suffixes) { assert(suffix.graph() && suffix.graph()->kind == NFA_SUFFIX); @@ -2294,11 +2206,6 @@ void mergeAcyclicSuffixes(RoseBuildImpl &tbi) { continue; } - if (isLargeLBR(*h, tbi.cc.grey)) { - DEBUG_PRINTF("not considering LBR suffix for merge\n"); - continue; - } - suffixes.insert(g[v].suffix, v); } @@ -2361,11 +2268,6 @@ void mergeSmallSuffixes(RoseBuildImpl &tbi) { continue; } - if (isLargeLBR(*h, tbi.cc.grey)) { - DEBUG_PRINTF("not considering LBR suffix for merge\n"); - continue; - } - suffixes.insert(g[v].suffix, v); } @@ -2499,7 +2401,7 @@ private: template static void pairwiseDfaMerge(vector &dfas, - ue2::unordered_map &dfa_mapping, + unordered_map &dfa_mapping, vector &outfixes, MergeFunctor merge_func) { DEBUG_PRINTF("merging group of size %zu\n", dfas.size()); @@ -2541,7 +2443,7 @@ void pairwiseDfaMerge(vector &dfas, template static void chunkedDfaMerge(vector &dfas, - ue2::unordered_map &dfa_mapping, + unordered_map &dfa_mapping, vector &outfixes, MergeFunctor merge_func) { DEBUG_PRINTF("begin merge of %zu dfas\n", dfas.size()); @@ -2575,7 +2477,7 @@ void mergeOutfixDfas(RoseBuildImpl &tbi, vector &dfas) { /* key is index into outfix array as iterators, etc may be invalidated by * element addition. */ - ue2::unordered_map dfa_mapping; + unordered_map dfa_mapping; for (size_t i = 0; i < outfixes.size(); i++) { auto *rdfa = outfixes[i].rdfa(); if (rdfa) { @@ -2619,7 +2521,7 @@ void mergeOutfixCombo(RoseBuildImpl &tbi, const ReportManager &rm, /* key is index into outfix array as iterators, etc may be invalidated by * element addition. 
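/* Editorial sketch, not part of the patch: what the top remapping in the
 * castle merge above does to the graph. mergeCastle() renumbers the repeats
 * it absorbs and reports the renumbering in top_map; every in-edge of the
 * absorbed castle's vertices is then rewritten so its trigger still selects
 * the right repeat. Values are hypothetical. */
std::map<u32, u32> top_map = {{0, 3}, {1, 4}}; // produced by mergeCastle()
u32 rose_top = 1;                // top carried on one in-edge before merging
rose_top = top_map.at(rose_top); // now 4: fires repeat 4 in the winner castle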
*/ size_t new_dfas = 0; - ue2::unordered_map dfa_mapping; + unordered_map dfa_mapping; vector dfas; for (auto it = tbi.outfixes.begin(); it != tbi.outfixes.end(); ++it) { @@ -2670,7 +2572,7 @@ void mergeOutfixHaigs(RoseBuildImpl &tbi, vector &dfas, vector &outfixes = tbi.outfixes; - ue2::unordered_map dfa_mapping; + unordered_map dfa_mapping; for (size_t i = 0; i < outfixes.size(); i++) { auto *haig = outfixes[i].haig(); if (haig) { @@ -2822,8 +2724,8 @@ void mergePuffixes(RoseBuildImpl &tbi) { static void updateCastleSuffix(RoseGraph &g, const shared_ptr &m, u32 top, const vector &verts) { - DEBUG_PRINTF("merged in as top %u, updating %zu vertices\n", top, - verts.size()); + DEBUG_PRINTF("merged in as top %u of %p, updating %zu vertices\n", top, + m.get(), verts.size()); for (auto v : verts) { assert(g[v].suffix.castle); @@ -2833,77 +2735,56 @@ void updateCastleSuffix(RoseGraph &g, const shared_ptr &m, } static -void mergeCastleSuffixes(RoseBuildImpl &tbi, - vector > &castles, - map, vector > &castle_map) { +void mergeCastleSuffixChunk(RoseGraph &g, const vector &castles, + const unordered_map> &eng_verts) { if (castles.size() <= 1) { return; } - RoseGraph &g = tbi.g; - const size_t max_size = CastleProto::max_occupancy; + DEBUG_PRINTF("merging reach %s, %zu elements\n", + describeClass(castles[0]->reach()).c_str(), castles.size()); - shared_ptr m = castles.front(); - assert(m->repeats.size() == 1); // Not yet merged. + CastleProto *m = nullptr; - // Cache repeats we've already merged, mapped to (prototype, top). That - // way, we can ensure that we don't construct more than one completely - // identical repeat. - typedef map, u32> > RepeatCache; - RepeatCache cache; - { - // Initial entry in cache. - const u32 top = m->repeats.begin()->first; - const PureRepeat &pr = m->repeats.begin()->second; - cache[pr] = make_pair(m, top); - } - - for (size_t i = 1; i < castles.size(); i++) { - shared_ptr c = castles[i]; + for (CastleProto *c : castles) { assert(c->repeats.size() == 1); // Not yet merged. - const PureRepeat &pr = c->repeats.begin()->second; - RepeatCache::const_iterator it = cache.find(pr); - if (it != cache.end()) { - DEBUG_PRINTF("reusing cached merge, top=%u, proto=%p\n", - it->second.second, it->second.first.get()); - updateCastleSuffix(g, it->second.first, it->second.second, - castle_map[c]); + assert(g[eng_verts.at(c).front()].suffix.castle.get() == c); + if (!m) { + m = c; continue; } - if (m->repeats.size() == max_size) { + u32 top = m->merge(c->repeats[0]); + if (top == CastleProto::max_occupancy) { // No room left to merge into 'm'. This one becomes the new 'm'. 
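/* Editorial sketch, not part of the patch: the packing behaviour of the
 * castle-suffix loop above, assuming a hypothetical max_occupancy of 3 and
 * five single-repeat candidate castles c1..c5 with the same reach:
 *   m = c1;  c2 merges in as top 1,  c3 merges in as top 2  (c1 is now full)
 *   m = c4;  c5 merges in as top 1
 * leaving two castle engines live at runtime instead of five, with
 * updateCastleSuffix() repointing the suffixes of the absorbed castles. */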
DEBUG_PRINTF("next mergee\n"); m = c; - u32 top = m->repeats.begin()->first; - cache[pr] = make_pair(m, top); - } else { - u32 top = m->add(pr); - updateCastleSuffix(g, m, top, castle_map[c]); - DEBUG_PRINTF("added to %p, top %u\n", m.get(), top); - cache[pr] = make_pair(m, top); + continue; } + updateCastleSuffix(g, g[eng_verts.at(m).front()].suffix.castle, top, + eng_verts.at(c)); + DEBUG_PRINTF("added to %p, top %u\n", m, top); } } -void mergeCastleSuffixes(RoseBuildImpl &tbi) { +void mergeCastleSuffixes(RoseBuildImpl &build) { DEBUG_PRINTF("entry\n"); - if (!(tbi.cc.grey.allowCastle && tbi.cc.grey.mergeSuffixes)) { + if (!build.cc.grey.allowCastle || !build.cc.grey.mergeSuffixes) { return; } - map, vector> castles; - map>> by_reach; + unordered_map> eng_verts; + map> by_reach; - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; for (auto v : vertices_range(g)) { if (!g[v].suffix.castle) { continue; } - shared_ptr c = g[v].suffix.castle; + CastleProto *c = g[v].suffix.castle.get(); if (c->repeats.size() != 1) { // This code assumes it's the only place merging is being done. @@ -2911,16 +2792,14 @@ void mergeCastleSuffixes(RoseBuildImpl &tbi) { continue; } - if (!contains(castles, c)) { + if (!contains(eng_verts, c)) { by_reach[c->reach()].push_back(c); } - castles[c].push_back(v); + eng_verts[c].push_back(v); } - for (auto &m : by_reach) { - DEBUG_PRINTF("reach %s, %zu elements\n", describeClass(m.first).c_str(), - m.second.size()); - mergeCastleSuffixes(tbi, m.second, castles); + for (auto &chunk : by_reach | map_values) { + mergeCastleSuffixChunk(g, chunk, eng_verts); } } diff --git a/src/rose/rose_build_merge.h b/src/rose/rose_build_merge.h index 0f765bff..6de6c778 100644 --- a/src/rose/rose_build_merge.h +++ b/src/rose/rose_build_merge.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,8 +27,8 @@ */ /** \file - * \brief Rose Build: functions for reducing the size of the Rose graph - * through merging. + * \brief Rose Build: functions for reducing the number of engines in a Rose + * graph through merging or deduplicating engines. */ #ifndef ROSE_BUILD_MERGE_H @@ -44,9 +44,6 @@ namespace ue2 { class NGHolder; class RoseBuildImpl; -void mergeDupeLeaves(RoseBuildImpl &tbi); -void uncalcLeaves(RoseBuildImpl &tbi); - bool dedupeLeftfixes(RoseBuildImpl &tbi); void mergeLeftfixesVariableLag(RoseBuildImpl &tbi); void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi); diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 01be11ef..a7332df7 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -26,8 +26,10 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "rose_build_misc.h" #include "rose_build_impl.h" +#include "rose_build_resources.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" #include "nfa/goughcompile.h" @@ -56,11 +58,9 @@ #include "ue2common.h" #include "grey.h" -#include #include using namespace std; -using boost::hash_combine; namespace ue2 { @@ -576,6 +576,9 @@ bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { return false; } +size_t RoseSuffixInfo::hash() const { + return hash_all(top, graph, castle, rdfa, haig, tamarama); +} void RoseSuffixInfo::reset(void) { top = 0; @@ -691,16 +694,7 @@ set all_tops(const suffix_id &s) { } size_t suffix_id::hash() const { - size_t val = 0; - hash_combine(val, g); - hash_combine(val, c); - hash_combine(val, d); - hash_combine(val, h); - return val; -} - -size_t hash_value(const suffix_id &s) { - return s.hash(); + return hash_all(g, c, d, h, t); } bool isAnchored(const left_id &r) { @@ -756,21 +750,25 @@ set all_tops(const left_id &r) { return {0}; } +set all_reports(const left_id &left) { + assert(left.graph() || left.castle() || left.haig() || left.dfa()); + if (left.graph()) { + return all_reports(*left.graph()); + } else if (left.castle()) { + return all_reports(*left.castle()); + } else if (left.dfa()) { + return all_reports(*left.dfa()); + } else { + return all_reports(*left.haig()); + } +} + u32 num_tops(const left_id &r) { return all_tops(r).size(); } size_t left_id::hash() const { - size_t val = 0; - hash_combine(val, g); - hash_combine(val, c); - hash_combine(val, d); - hash_combine(val, h); - return val; -} - -size_t hash_value(const left_id &r) { - return r.hash(); + return hash_all(g, c, d, h); } u64a findMaxOffset(const set &reports, const ReportManager &rm) { @@ -787,6 +785,10 @@ u64a findMaxOffset(const set &reports, const ReportManager &rm) { return maxOffset; } +size_t LeftEngInfo::hash() const { + return hash_all(graph, castle, dfa, haig, tamarama, lag, leftfix_report); +} + void LeftEngInfo::reset(void) { graph.reset(); castle.reset(); @@ -807,18 +809,16 @@ LeftEngInfo::operator bool() const { return graph || castle || dfa || haig; } -u32 roseQuality(const RoseEngine *t) { +u32 roseQuality(const RoseResources &res, const RoseEngine *t) { /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs */ - const anchored_matcher_info *atable = getALiteralMatcher(t); - if (atable) { - if (atable->next_offset) { + if (res.has_anchored) { + if (res.has_anchored_multiple) { DEBUG_PRINTF("multiple atable engines\n"); return 0; } - const NFA *nfa = (const NFA *)((const char *)atable + sizeof(*atable)); - if (!isSmallDfaType(nfa->type)) { + if (res.has_anchored_large) { DEBUG_PRINTF("m16 atable engine\n"); return 0; } @@ -827,7 +827,7 @@ u32 roseQuality(const RoseEngine *t) { /* if we always run multiple engines then we are slow */ u32 always_run = 0; - if (atable) { + if (res.has_anchored) { always_run++; } @@ -836,8 +836,7 @@ u32 roseQuality(const RoseEngine *t) { always_run++; } - const HWLM *ftable = getFLiteralMatcher(t); - if (ftable) { + if (res.has_floating) { /* TODO: ignore conditional ftables, or ftables beyond smwr region */ always_run++; } @@ -997,8 +996,8 @@ bool canImplementGraphs(const RoseBuildImpl &tbi) { bool hasOrphanedTops(const RoseBuildImpl &build) { const RoseGraph &g = build.g; - ue2::unordered_map > roses; - ue2::unordered_map > suffixes; + unordered_map> roses; + unordered_map> suffixes; for (auto v : vertices_range(g)) { if (g[v].left) { diff --git a/src/rose/rose_build_misc.h 
b/src/rose/rose_build_misc.h new file mode 100644 index 00000000..f34b8292 --- /dev/null +++ b/src/rose/rose_build_misc.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_BUILD_MISC_H +#define ROSE_BUILD_MISC_H + +#include "ue2common.h" + +struct RoseEngine; + +namespace ue2 { + +struct RoseResources; + +/* used by heuristics to determine the small write engine. High numbers are + * intended to indicate a lightweight rose. */ +u32 roseQuality(const RoseResources &res, const RoseEngine *rose); + +} + +#endif diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp index 23a8b959..8f350e29 100644 --- a/src/rose/rose_build_program.cpp +++ b/src/rose/rose_build_program.cpp @@ -41,6 +41,7 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/report_manager.h" +#include "util/unordered.h" #include "util/verify_types.h" #include @@ -226,7 +227,7 @@ size_t RoseProgramHash::operator()(const RoseProgram &program) const { size_t v = 0; for (const auto &ri : program) { assert(ri); - boost::hash_combine(v, ri->hash()); + hash_combine(v, ri->hash()); } return v; } @@ -1288,19 +1289,28 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id, vector look; - const ue2_literal &s = build.literals.at(lit_id).s; + const auto &lit = build.literals.at(lit_id); + const ue2_literal &s = lit.s; + const auto &msk = lit.msk; + DEBUG_PRINTF("building mask for lit %u: %s\n", lit_id, dumpString(s).c_str()); + assert(s.length() <= MAX_MASK2_WIDTH); - s32 i = 0 - s.length(); - for (const auto &e : s) { - if (!e.nocase) { - look.emplace_back(verify_s8(i), e); + + // Note: the literal matcher will confirm the HWLM mask in lit.msk, so we + // do not include those entries in the lookaround. 
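/* Editorial sketch, not part of the patch: the index arithmetic used in
 * makeCheckLitMaskInstruction() above, worked through for a hypothetical
 * literal of length 8 whose HWLM mask msk covers its final 3 bytes:
 *   i starts at -8 (s.length() bytes before the match end);
 *   i_end is -3   (the last msk.size() bytes are confirmed by the literal
 *                  matcher's mask);
 *   lookaround entries are emitted only for offsets -8..-4, and only for
 *   caseful characters there. If every one of those bytes is caseless, look
 *   stays empty and no lookaround instruction is generated. */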
+ auto it = s.begin(); + for (s32 i = 0 - s.length(), i_end = 0 - msk.size(); i < i_end; ++i, ++it) { + if (!it->nocase) { + look.emplace_back(verify_s8(i), *it); } - i++; } - assert(!look.empty()); + if (look.empty()) { + return; // all caseful chars handled by HWLM mask. + } + makeLookaroundInstruction(look, program); } @@ -1925,14 +1935,14 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 lit_id, namespace { struct ProgKey { - ProgKey(const RoseProgram &p) : prog(&p) { } + ProgKey(const RoseProgram &p) : prog(&p) {} bool operator==(const ProgKey &b) const { return RoseProgramEquivalence()(*prog, *b.prog); } - friend size_t hash_value(const ProgKey &a) { - return RoseProgramHash()(*a.prog); + size_t hash() const { + return RoseProgramHash()(*prog); } private: const RoseProgram *prog; @@ -1945,7 +1955,7 @@ RoseProgram assembleProgramBlocks(vector &&blocks_in) { vector blocks; blocks.reserve(blocks_in.size()); /* to ensure stable reference for seen */ - unordered_set seen; + ue2_unordered_set seen; for (auto &block : blocks_in) { if (contains(seen, block)) { continue; @@ -2155,6 +2165,14 @@ RoseProgram makeBoundaryProgram(const RoseBuildImpl &build, return prog; } +void addIncludedJumpProgram(RoseProgram &program, u32 child_offset, + u8 squash) { + RoseProgram block; + block.add_before_end(make_unique(child_offset, + squash)); + program.add_block(move(block)); +} + static void addPredBlockSingle(u32 pred_state, RoseProgram &pred_block, RoseProgram &program) { diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h index 8758ef64..cc59303f 100644 --- a/src/rose/rose_build_program.h +++ b/src/rose/rose_build_program.h @@ -34,8 +34,8 @@ #include "util/bytecode_ptr.h" #include "util/hash.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" +#include #include #include @@ -168,7 +168,7 @@ struct ProgramBuild : noncopyable { /** \brief Mapping from vertex to key, for vertices with a * CHECK_NOT_HANDLED instruction. */ - ue2::unordered_map handledKeys; + std::unordered_map handledKeys; /** \brief Mapping from Rose literal ID to anchored program index. */ std::map anchored_programs; @@ -178,7 +178,7 @@ struct ProgramBuild : noncopyable { /** \brief Mapping from every vertex to the groups that must be on for that * vertex to be reached. */ - ue2::unordered_map vertex_group_map; + std::unordered_map vertex_group_map; /** \brief Global bitmap of groups that can be squashed. 
*/ rose_group squashable_groups = 0; @@ -239,13 +239,13 @@ struct engine_info { RoseProgram assembleProgramBlocks(std::vector &&blocks); RoseProgram makeLiteralProgram(const RoseBuildImpl &build, - const std::map &leftfix_info, - const std::map &suffixes, - const std::map &engine_info_by_queue, - const unordered_map &roleStateIndices, - ProgramBuild &prog_build, u32 lit_id, - const std::vector &lit_edges, - bool is_anchored_replay_program); + const std::map &leftfix_info, + const std::map &suffixes, + const std::map &engine_info_by_queue, + const std::unordered_map &roleStateIndices, + ProgramBuild &prog_build, u32 lit_id, + const std::vector &lit_edges, + bool is_anchored_replay_program); RoseProgram makeDelayRebuildProgram(const RoseBuildImpl &build, ProgramBuild &prog_build, @@ -282,6 +282,7 @@ void recordLongLiterals(std::vector &longLiterals, void recordResources(RoseResources &resources, const RoseProgram &program); +void addIncludedJumpProgram(RoseProgram &program, u32 child_offset, u8 squash); } // namespace ue2 #endif // ROSE_BUILD_PROGRAM_H diff --git a/src/rose/rose_build_resources.h b/src/rose/rose_build_resources.h index 3edb81b9..4fa102f3 100644 --- a/src/rose/rose_build_resources.h +++ b/src/rose/rose_build_resources.h @@ -48,6 +48,8 @@ struct RoseResources { bool has_lit_delay = false; bool has_lit_check = false; // long literal support bool has_anchored = false; + bool has_anchored_multiple = false; /* multiple anchored dfas */ + bool has_anchored_large = false; /* mcclellan 16 anchored dfa */ bool has_floating = false; bool has_eod = false; }; diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 0e78ec7d..359550e1 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -45,16 +45,15 @@ #include "util/bitutils.h" #include "util/compile_context.h" #include "util/container.h" +#include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" #include "util/hash.h" #include "util/order_check.h" -#include "util/ue2_containers.h" #include #include #include -#include #include #include @@ -63,6 +62,8 @@ using boost::adaptors::map_values; namespace ue2 { +static constexpr size_t MERGE_GROUP_SIZE_MAX = 200; + namespace { // Used for checking edge sets (both in- and out-) against each other. struct EdgeAndVertex { @@ -154,7 +155,7 @@ public: private: /* if a vertex is worth storing, it is worth storing twice */ set main_cont; /* deterministic iterator */ - ue2::unordered_set hash_cont; /* member checks */ + unordered_set hash_cont; /* member checks */ }; struct RoseAliasingInfo { @@ -175,10 +176,10 @@ struct RoseAliasingInfo { } /** \brief Mapping from leftfix to vertices. */ - ue2::unordered_map> rev_leftfix; + unordered_map> rev_leftfix; /** \brief Mapping from undelayed ghost to delayed vertices. */ - ue2::unordered_map> rev_ghost; + unordered_map> rev_ghost; }; } // namespace @@ -771,9 +772,13 @@ void pruneCastle(CastleProto &castle, ReportID report) { /** \brief Set all reports to the given one. 
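/* Editorial sketch, not part of the patch: the "store it twice" pattern used
 * by CandidateSet above, in minimal form. The ordered set gives a
 * deterministic iteration order (so compiles stay reproducible), while the
 * hash set makes membership checks cheap; both always hold the same
 * elements. Names and the template form are illustrative only. */
template<typename T>
class DualSet {
public:
    bool insert(const T &v) {
        if (!hash_cont.insert(v).second) {
            return false; // already present
        }
        main_cont.insert(v);
        return true;
    }
    bool contains(const T &v) const { return hash_cont.count(v) != 0; }
    typename std::set<T>::const_iterator begin() const {
        return main_cont.begin(); // iterate in deterministic order
    }
    typename std::set<T>::const_iterator end() const {
        return main_cont.end();
    }
private:
    std::set<T> main_cont;           // deterministic iteration
    std::unordered_set<T> hash_cont; // fast member checks
};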
*/ static void setReports(CastleProto &castle, ReportID report) { - for (auto &repeat : castle.repeats | map_values) { + castle.report_map.clear(); + for (auto &e : castle.repeats) { + u32 top = e.first; + auto &repeat = e.second; repeat.reports.clear(); repeat.reports.insert(report); + castle.report_map[report].insert(top); } } @@ -787,7 +792,7 @@ void updateEdgeTops(RoseGraph &g, RoseVertex v, const map &top_map) { static void pruneUnusedTops(CastleProto &castle, const RoseGraph &g, const set &verts) { - ue2::unordered_set used_tops; + unordered_set used_tops; for (auto v : verts) { assert(g[v].left.castle.get() == &castle); @@ -818,7 +823,7 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g, } assert(isCorrectlyTopped(h)); DEBUG_PRINTF("pruning unused tops\n"); - ue2::flat_set used_tops; + flat_set used_tops; for (auto v : verts) { assert(g[v].left.graph.get() == &h); @@ -1415,7 +1420,7 @@ void removeSingletonBuckets(vector> &buckets) { static void buildInvBucketMap(const vector> &buckets, - ue2::unordered_map &inv) { + unordered_map &inv) { inv.clear(); for (size_t i = 0; i < buckets.size(); i++) { for (auto v : buckets[i]) { @@ -1483,14 +1488,15 @@ void splitByLiteralTable(const RoseBuildImpl &build, auto make_split_key = [&](RoseVertex v) { const auto &lits = g[v].literals; assert(!lits.empty()); - return build.literals.at(*lits.begin()).table; + auto table = build.literals.at(*lits.begin()).table; + return std::underlying_type::type(table); }; splitAndFilterBuckets(buckets, make_split_key); } static void splitByNeighbour(const RoseGraph &g, vector> &buckets, - ue2::unordered_map &inv, bool succ) { + unordered_map &inv, bool succ) { vector> extras; map> neighbours_by_bucket; set picked; @@ -1575,7 +1581,7 @@ splitDiamondMergeBuckets(CandidateSet &candidates, const RoseBuildImpl &build) { } // Neighbour splits require inverse map. - ue2::unordered_map inv; + unordered_map inv; buildInvBucketMap(buckets, inv); splitByNeighbour(g, buckets, inv, true); @@ -2026,4 +2032,304 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { assert(canImplementGraphs(build)); } +namespace { +struct DupeLeafKey { + explicit DupeLeafKey(const RoseVertexProps &litv) + : literals(litv.literals), reports(litv.reports), + eod_accept(litv.eod_accept), suffix(litv.suffix), left(litv.left), + som_adjust(litv.som_adjust) { + DEBUG_PRINTF("eod_accept %d\n", (int)eod_accept); + DEBUG_PRINTF("report %u\n", left.leftfix_report); + DEBUG_PRINTF("lag %u\n", left.lag); + } + + bool operator<(const DupeLeafKey &b) const { + const DupeLeafKey &a = *this; + ORDER_CHECK(literals); + ORDER_CHECK(eod_accept); + ORDER_CHECK(suffix); + ORDER_CHECK(reports); + ORDER_CHECK(som_adjust); + ORDER_CHECK(left.leftfix_report); + ORDER_CHECK(left.lag); + return false; + } + + flat_set literals; + flat_set reports; + bool eod_accept; + suffix_id suffix; + LeftEngInfo left; + u32 som_adjust; +}; + +struct UncalcLeafKey { + UncalcLeafKey(const RoseGraph &g, RoseVertex v) + : literals(g[v].literals), rose(g[v].left) { + for (const auto &e : in_edges_range(v, g)) { + RoseVertex u = source(e, g); + preds.insert(make_pair(u, g[e])); + } + } + + bool operator<(const UncalcLeafKey &b) const { + const UncalcLeafKey &a = *this; + ORDER_CHECK(literals); + ORDER_CHECK(preds); + ORDER_CHECK(rose); + return false; + } + + flat_set literals; + flat_set> preds; + LeftEngInfo rose; +}; +} // namespace + +/** + * This function merges leaf vertices with the same literals and report + * id/suffix. 
The leaf vertices of the graph are inspected and a mapping of + * leaf vertex properties to vertices is built. If the same set of leaf + * properties has already been seen when we inspect a vertex, we attempt to + * merge the vertex in with the previously seen vertex. This process can fail + * if the vertices share a common predecessor vertex but have a differing, + * incompatible relationship (different bounds or infix) with the predecessor. + * + * This takes place after \ref dedupeSuffixes to increase effectiveness as the + * same suffix is required for a merge to occur. + * + * TODO: work if this is a subset of role aliasing (and if it can be eliminated) + * or clearly document cases that would not be covered by role aliasing. + */ +void mergeDupeLeaves(RoseBuildImpl &build) { + map leaves; + vector changed; + + RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (in_degree(v, g) == 0) { + assert(build.isAnyStart(v)); + continue; + } + + DEBUG_PRINTF("inspecting vertex index=%zu in_degree %zu " + "out_degree %zu\n", g[v].index, in_degree(v, g), + out_degree(v, g)); + + // Vertex must be a reporting leaf node + if (g[v].reports.empty() || !isLeafNode(v, g)) { + continue; + } + + // At the moment, we ignore all successors of root or anchored_root, + // since many parts of our runtime assume that these have in-degree 1. + if (build.isRootSuccessor(v)) { + continue; + } + + DupeLeafKey dupe(g[v]); + if (leaves.find(dupe) == leaves.end()) { + leaves.insert(make_pair(dupe, v)); + continue; + } + + RoseVertex t = leaves.find(dupe)->second; + DEBUG_PRINTF("found two leaf dupe roles, index=%zu,%zu\n", g[v].index, + g[t].index); + + vector deadEdges; + for (const auto &e : in_edges_range(v, g)) { + RoseVertex u = source(e, g); + DEBUG_PRINTF("u index=%zu\n", g[u].index); + if (RoseEdge et = edge(u, t, g)) { + if (g[et].minBound <= g[e].minBound + && g[et].maxBound >= g[e].maxBound) { + DEBUG_PRINTF("remove more constrained edge\n"); + deadEdges.push_back(e); + } + } else { + DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].index, + g[t].index); + add_edge(u, t, g[e], g); + deadEdges.push_back(e); + } + } + + if (!deadEdges.empty()) { + for (auto &e : deadEdges) { + remove_edge(e, g); + } + changed.push_back(v); + g[t].min_offset = min(g[t].min_offset, g[v].min_offset); + g[t].max_offset = max(g[t].max_offset, g[v].max_offset); + } + } + DEBUG_PRINTF("find loop done\n"); + + // Remove any vertices that now have no in-edges. + size_t countRemovals = 0; + for (size_t i = 0; i < changed.size(); i++) { + RoseVertex v = changed[i]; + if (in_degree(v, g) == 0) { + DEBUG_PRINTF("remove vertex\n"); + if (!build.isVirtualVertex(v)) { + for (u32 lit_id : g[v].literals) { + build.literal_info[lit_id].vertices.erase(v); + } + } + remove_vertex(v, g); + countRemovals++; + } + } + + // if we've removed anything, we need to renumber vertices + if (countRemovals) { + renumber_vertices(g); + DEBUG_PRINTF("removed %zu vertices.\n", countRemovals); + } +} + +/** Merges the suffixes on the (identical) vertices in \a vcluster, used by + * \ref uncalcLeaves. */ +static +void mergeCluster(RoseGraph &g, const ReportManager &rm, + const vector &vcluster, + vector &dead, const CompileContext &cc) { + if (vcluster.size() <= 1) { + return; // No merge to perform. + } + + // Note that we batch merges up fairly crudely for performance reasons. 
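/* Editorial sketch, not part of the patch: the edge test in mergeDupeLeaves()
 * above, in isolation. An edge into the duplicate leaf is only dropped when
 * the surviving leaf already has an edge from the same predecessor whose
 * bounds are no more constrained, i.e. they subsume the duplicate's bounds.
 * Example with hypothetical bounds:
 *   surviving  u->t : [min 2, max 10]
 *   duplicate  u->v : [min 4, max  8]  -> subsumed, duplicate edge removed
 *   duplicate  u->v : [min 1, max  8]  -> not subsumed, edge kept on v */
static
bool subsumesBounds(u32 t_min, u32 t_max, u32 e_min, u32 e_max) {
    return t_min <= e_min && t_max >= e_max;
}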
+ vector::const_iterator it = vcluster.begin(), it2; + while (it != vcluster.end()) { + vector cluster; + map rev; + + for (it2 = it; + it2 != vcluster.end() && cluster.size() < MERGE_GROUP_SIZE_MAX; + ++it2) { + RoseVertex v = *it2; + NGHolder *h = g[v].suffix.graph.get(); + assert(!g[v].suffix.haig); /* should not be here if haig */ + rev[h] = v; + cluster.push_back(h); + } + it = it2; + + DEBUG_PRINTF("merging cluster %zu\n", cluster.size()); + auto merged = mergeNfaCluster(cluster, &rm, cc); + DEBUG_PRINTF("done\n"); + + for (const auto &m : merged) { + NGHolder *h_victim = m.first; // mergee + NGHolder *h_winner = m.second; + RoseVertex victim = rev[h_victim]; + RoseVertex winner = rev[h_winner]; + + LIMIT_TO_AT_MOST(&g[winner].min_offset, g[victim].min_offset); + ENSURE_AT_LEAST(&g[winner].max_offset, g[victim].max_offset); + insert(&g[winner].reports, g[victim].reports); + + dead.push_back(victim); + } + } +} + +static +void findUncalcLeavesCandidates(RoseBuildImpl &build, + map > &clusters, + deque &ordered) { + const RoseGraph &g = build.g; + + vector suffix_vertices; // vertices with suffix graphs + unordered_map fcount; // ref count per graph + + for (auto v : vertices_range(g)) { + if (g[v].suffix) { + if (!g[v].suffix.graph) { + continue; /* cannot uncalc (haig/mcclellan); TODO */ + } + + assert(g[v].suffix.graph->kind == NFA_SUFFIX); + + // Ref count all suffixes, as we don't want to merge a suffix + // that happens to be shared with a non-leaf vertex somewhere. + DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].index, + g[v].suffix.graph.get()); + fcount[g[v].suffix.graph.get()]++; + + // Vertex must be a reporting pseudo accept + if (!isLeafNode(v, g)) { + continue; + } + + suffix_vertices.push_back(v); + } + } + + for (auto v : suffix_vertices) { + if (in_degree(v, g) == 0) { + assert(build.isAnyStart(v)); + continue; + } + + const NGHolder *h = g[v].suffix.graph.get(); + assert(h); + DEBUG_PRINTF("suffix %p\n", h); + + // We can't easily merge suffixes shared with other vertices, and + // creating a unique copy to do so may just mean we end up tracking + // more NFAs. Better to leave shared suffixes alone. + if (fcount[h] != 1) { + DEBUG_PRINTF("skipping shared suffix\n"); + continue; + } + + UncalcLeafKey key(g, v); + vector &vec = clusters[key]; + if (vec.empty()) { + + ordered.push_back(key); + } + vec.push_back(v); + } + + DEBUG_PRINTF("find loop done\n"); +} + +/** + * This function attempts to combine identical roles (same literals, same + * predecessors, etc) with different suffixes into a single role which + * activates a larger suffix. The leaf vertices of the graph with a suffix are + * grouped into clusters which have members triggered by identical roles. The + * \ref mergeNfaCluster function (from ng_uncalc_components) is then utilised + * to build a set of larger (and still implementable) suffixes. The graph is + * then updated to point to the new suffixes and any unneeded roles are + * removed. + * + * Note: suffixes which are shared amongst multiple roles are not considered + * for this pass as the individual suffixes would have to continue to exist for + * the other roles to trigger resulting in the transformation not producing any + * savings. + * + * Note: as \ref mergeNfaCluster is slow when the cluster sizes are large, + * clusters of more than \ref MERGE_GROUP_SIZE_MAX roles are split into smaller + * chunks for processing. 
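/* Editorial sketch, not part of the patch: when mergeCluster() above folds a
 * victim role into a winner, the winner's offset window widens to cover both
 * roles. With hypothetical values, winner [5, 20] absorbing victim [3, 12]
 * ends up reporting anywhere in [3, 20]. The helper name is an assumption;
 * the real code uses the LIMIT_TO_AT_MOST / ENSURE_AT_LEAST macros. */
static
void widenOffsetWindow(u32 &win_min, u32 &win_max, u32 v_min, u32 v_max) {
    win_min = std::min(win_min, v_min); // LIMIT_TO_AT_MOST
    win_max = std::max(win_max, v_max); // ENSURE_AT_LEAST
}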
+ */ +void uncalcLeaves(RoseBuildImpl &build) { + DEBUG_PRINTF("uncalcing\n"); + + map > clusters; + deque ordered; + findUncalcLeavesCandidates(build, clusters, ordered); + + vector dead; + + for (const auto &key : ordered) { + DEBUG_PRINTF("cluster of size %zu\n", clusters[key].size()); + mergeCluster(build.g, build.rm, clusters[key], dead, build.cc); + } + build.removeVertices(dead); +} + } // namespace ue2 diff --git a/src/rose/rose_build_role_aliasing.h b/src/rose/rose_build_role_aliasing.h index 274b76f9..4655f10d 100644 --- a/src/rose/rose_build_role_aliasing.h +++ b/src/rose/rose_build_role_aliasing.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef ROSE_BUILD_ROLE_ALIASING -#define ROSE_BUILD_ROLE_ALIASING +#ifndef ROSE_BUILD_ROLE_ALIASING_H +#define ROSE_BUILD_ROLE_ALIASING_H + +/** \file + * \brief Rose Build: functions for reducing the size of the Rose graph + * through merging roles (RoseVertices) together. + */ namespace ue2 { @@ -35,6 +40,9 @@ class RoseBuildImpl; void aliasRoles(RoseBuildImpl &build, bool mergeRoses); +void mergeDupeLeaves(RoseBuildImpl &build); +void uncalcLeaves(RoseBuildImpl &build); + } // namespace ue2 #endif diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h index b7e092bb..2c5ebbe9 100644 --- a/src/rose/rose_graph.h +++ b/src/rose/rose_graph.h @@ -43,7 +43,7 @@ #include "nfa/nfa_internal.h" // for MO_INVALID_IDX #include "util/charreach.h" #include "util/depth.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/ue2_graph.h" #include @@ -111,6 +111,7 @@ struct LeftEngInfo { ORDER_CHECK(leftfix_report); return false; } + size_t hash() const; void reset(void); operator bool() const; bool tracksSom() const { return !!haig; } @@ -131,6 +132,7 @@ struct RoseSuffixInfo { bool operator==(const RoseSuffixInfo &b) const; bool operator!=(const RoseSuffixInfo &b) const { return !(*this == b); } bool operator<(const RoseSuffixInfo &b) const; + size_t hash() const; void reset(void); operator bool() const { return graph || castle || haig || rdfa || tamarama; } }; diff --git a/src/rose/rose_in_dump.cpp b/src/rose/rose_in_dump.cpp index 172b58e8..5266e9d7 100644 --- a/src/rose/rose_in_dump.cpp +++ b/src/rose/rose_in_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,6 +35,7 @@ #include "nfagraph/ng_dump.h" #include "nfagraph/ng_util.h" #include "util/container.h" +#include "util/dump_util.h" #include "util/graph_range.h" #include @@ -59,7 +60,7 @@ void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, filename = "pre_rose.dot"; } DEBUG_PRINTF("dumping rose graphs\n"); - FILE *f = fopen((grey.dumpPath + filename).c_str(), "w"); + StdioFile f(grey.dumpPath + filename, "w"); fprintf(f, "digraph NFA {\n"); fprintf(f, "rankdir=LR;\n"); fprintf(f, "size=\"11.5,8\"\n"); @@ -127,7 +128,6 @@ void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, } fprintf(f, "}\n"); - fclose(f); } } diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h index 42c59932..ed4644ae 100644 --- a/src/rose/rose_in_graph.h +++ 
b/src/rose/rose_in_graph.h @@ -45,7 +45,7 @@ #include "ue2common.h" #include "rose/rose_common.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "util/ue2_graph.h" #include "util/ue2string.h" diff --git a/src/rose/rose_in_util.cpp b/src/rose/rose_in_util.cpp index 3b31b38e..9fe47c27 100644 --- a/src/rose/rose_in_util.cpp +++ b/src/rose/rose_in_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,6 @@ #include "util/container.h" #include "util/graph_range.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" #include diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 57395c9d..d38ee8c0 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -170,6 +170,12 @@ struct NfaInfo { #define OWB_ZOMBIE_ALWAYS_YES 128 /* nfa will always answer yes to any rose * prefix checks */ +/* offset of the status flags in the stream state. */ +#define ROSE_STATE_OFFSET_STATUS_FLAGS 0 + +/* offset of role mmbit in stream state (just after the status flag byte). */ +#define ROSE_STATE_OFFSET_ROLE_MMBIT sizeof(u8) + /** * \brief Rose state offsets. * @@ -184,24 +190,28 @@ struct NfaInfo { struct RoseStateOffsets { /** History buffer. * - * First byte is an 8-bit count of the number of valid history bytes - * available, followed by the history itself. Max size of history is - * RoseEngine::historyRequired. */ + * Max size of history is RoseEngine::historyRequired. */ u32 history; - /** Exhausted bitvector. + /** Exhausted multibit. * - * 1 bit per exhaustible key (used by Highlander mode). If a bit is set, + * entry per exhaustible key (used by Highlander mode). If a bit is set, * reports with that ekey should not be delivered to the user. */ u32 exhausted; + /** size of exhausted multibit */ + u32 exhausted_size; + /** Multibit for active suffix/outfix engines. */ u32 activeLeafArray; - /** Multibit for active Rose (prefix/infix) engines. */ + /** Size of multibit for active suffix/outfix engines in bytes. */ + u32 activeLeafArray_size; + + /** Multibit for active leftfix (prefix/infix) engines. */ u32 activeLeftArray; - /** Size of the active Rose array multibit, in bytes. */ + /** Size of multibit for active leftfix (prefix/infix) engines in bytes. */ u32 activeLeftArray_size; /** Table of lag information (stored as one byte per engine) for active @@ -220,6 +230,9 @@ struct RoseStateOffsets { /** State for long literal support. */ u32 longLitState; + /** Size of the long literal state. */ + u32 longLitState_size; + /** Packed SOM location slots. */ u32 somLocation; @@ -229,6 +242,13 @@ struct RoseStateOffsets { /** Multibit guarding SOM location slots. */ u32 somWritable; + /** Size of each of the somValid and somWritable multibits, in bytes. */ + u32 somMultibit_size; + + /** Begin of the region where NFA engine state is stored. + * The NFA state region extends to end. */ + u32 nfaStateBegin; + /** Total size of Rose state, in bytes. 
*/ u32 end; }; @@ -317,7 +337,6 @@ struct RoseEngine { u32 stateSize; /* size of the state bitset * WARNING: not the size of the rose state */ u32 anchorStateSize; /* size of the state for the anchor dfas */ - u32 nfaStateSize; /* total size of the state for the mask/rose nfas */ u32 tStateSize; /* total size of the state for transient rose nfas */ u32 scratchStateSize; /**< uncompressed state req'd for NFAs in scratch; * used for sizing scratch only. */ diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 78b123d5..eeebfed1 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -178,7 +178,12 @@ enum RoseInstructionCode { */ ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64, - LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64 //!< Sentinel. + /** + * \brief Jump to the program of included literal. + */ + ROSE_INSTR_INCLUDED_JUMP, + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_INCLUDED_JUMP //!< Sentinel. }; struct ROSE_STRUCT_END { @@ -625,4 +630,10 @@ struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 { s32 last_start; //!< The latest start offset among 8 paths. u32 fail_jump; //!< Jump forward this many bytes on failure. }; + +struct ROSE_STRUCT_INCLUDED_JUMP { + u8 code; //!< From enum RoseInstructionCode. + u8 squash; //!< FDR confirm squash mask for included literal. + u32 child_offset; //!< Program offset of included literal. +}; #endif // ROSE_ROSE_PROGRAM_H diff --git a/src/rose/runtime.h b/src/rose/runtime.h index d2a4b5d7..88342b53 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -68,7 +68,7 @@ const void *getByOffset(const struct RoseEngine *t, u32 offset) { static really_inline void *getRoleState(char *state) { - return state + sizeof(u8); // status flags + return state + ROSE_STATE_OFFSET_ROLE_MMBIT; } /** \brief Fetch the active array for suffix nfas. */ diff --git a/src/rose/stream.c b/src/rose/stream.c index c68cd8ab..d667ae56 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -659,8 +659,8 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { DEBUG_PRINTF("start=%zu\n", start); DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); - hwlmExecStreaming(ftable, scratch, flen, start, roseFloatingCallback, - scratch, tctxt->groups & t->floating_group_mask); + hwlmExecStreaming(ftable, flen, start, roseFloatingCallback, scratch, + tctxt->groups & t->floating_group_mask); } flush_delay_and_exit: @@ -742,11 +742,9 @@ void roseStreamEodExec(const struct RoseEngine *t, u64a offset, assert(!scratch->tctxt.filledDelayedSlots); const u64a som = 0; - const size_t match_len = 0; const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; // Note: we ignore the result, as this is the last thing to ever happen on // a scan. 
- roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - flags); + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); } diff --git a/src/runtime.c b/src/runtime.c index 5725cf93..c384c031 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -53,6 +53,7 @@ #include "som/som_runtime.h" #include "som/som_stream.h" #include "state.h" +#include "stream_compress.h" #include "ue2common.h" #include "util/exhaust.h" #include "util/multibit.h" @@ -139,6 +140,7 @@ void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, s->som_set_now_offset = ~0ULL; s->deduper.current_report_offset = ~0ULL; s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ + s->fdr_conf = NULL; // Rose program execution (used for some report paths) depends on these // values being initialised. @@ -153,7 +155,7 @@ void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, /** \brief Retrieve status bitmask from stream state. */ static really_inline u8 getStreamStatus(const char *state) { - u8 status = *(const u8 *)state; + u8 status = *(const u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS); assert((status & ~STATUS_VALID_BITS) == 0); return status; } @@ -162,7 +164,7 @@ u8 getStreamStatus(const char *state) { static really_inline void setStreamStatus(char *state, u8 status) { assert((status & ~STATUS_VALID_BITS) == 0); - *(u8 *)state = status; + *(u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS) = status; } /** \brief Initialise SOM state. Used in both block and streaming mode. */ @@ -764,7 +766,7 @@ void pureLiteralStreamExec(struct hs_stream *stream_state, // start the match region at zero. const size_t start = 0; - hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, scratch, + hwlmExecStreaming(ftable, len2, start, roseCallback, scratch, rose->initialGroups & rose->floating_group_mask); if (!told_to_stop_matching(scratch) && @@ -1092,3 +1094,100 @@ hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db, return HS_SUCCESS; } + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space) { + if (unlikely(!stream || !used_space)) { + return HS_INVALID; + } + + if (unlikely(buf_space && !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = stream->rose; + + size_t stream_size = size_compress_stream(rose, stream); + + DEBUG_PRINTF("require %zu [orig %zu]\n", stream_size, + rose->stateOffsets.end + sizeof(struct hs_stream)); + *used_space = stream_size; + + if (buf_space < stream_size) { + return HS_INSUFFICIENT_SPACE; + } + compress_stream(buf, stream_size, rose, stream); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, + const char *buf, size_t buf_size) { + if (unlikely(!stream || !buf)) { + return HS_INVALID; + } + + *stream = NULL; + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_STREAM)) { + return HS_DB_MODE_ERROR; + } + + size_t stream_size = rose->stateOffsets.end + sizeof(struct hs_stream); + + struct hs_stream *s = hs_stream_alloc(stream_size); + if (unlikely(!s)) { + return HS_NOMEM; + } + + if (!expand_stream(s, rose, buf, buf_size)) { + hs_stream_free(s); + return HS_INVALID; + } + + *stream = s; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL 
hs_reset_and_expand_stream(hs_stream_t *to_stream, + const char *buf, size_t buf_size, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (unlikely(!to_stream || !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = to_stream->rose; + + if (onEvent) { + if (!scratch || !validScratch(to_stream->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(to_stream, scratch, onEvent, context); + unmarkScratchInUse(scratch); + } + + if (expand_stream(to_stream, rose, buf, buf_size)) { + return HS_SUCCESS; + } else { + return HS_INVALID; + } +} diff --git a/src/scratch.c b/src/scratch.c index 84d23ced..8e082c77 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -136,6 +136,7 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { s->in_use = 0; s->scratchSize = alloc_size; s->scratch_alloc = (char *)s_tmp; + s->fdr_conf = NULL; // each of these is at an offset from the previous char *current = (char *)s + sizeof(*s); diff --git a/src/scratch.h b/src/scratch.h index 47f8afa8..fa998e84 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -45,7 +45,6 @@ extern "C" #endif UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; -#define FDR_TEMP_BUF_SIZE 222 struct fatbit; struct hs_scratch; @@ -201,7 +200,9 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { u32 delay_fatbit_size; /**< size of each delay fatbit in bytes */ u32 scratchSize; char *scratch_alloc; /* user allocated scratch object */ - u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE]; + u64a *fdr_conf; /**< FDR confirm value */ + u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches + * in buffer */ }; /* array of fatbit ptr; TODO: why not an array of fatbits? */ diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index bb933cbe..345edfe9 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -132,12 +132,10 @@ public: set all_reports() const override; - bool determiniseLiterals(); - const ReportManager &rm; const CompileContext &cc; - unique_ptr rdfa; + vector> dfas; LitTrie lit_trie; LitTrie lit_trie_nocase; size_t num_literals = 0; @@ -226,6 +224,40 @@ bool pruneOverlong(NGHolder &g, const depth &max_depth, return modified; } +/** + * \brief Attempt to merge the set of DFAs given down into a single raw_dfa. + * Returns false on failure. + */ +static +bool mergeDfas(vector> &dfas, const ReportManager &rm, + const CompileContext &cc) { + assert(!dfas.empty()); + + if (dfas.size() == 1) { + return true; + } + + DEBUG_PRINTF("attempting to merge %zu DFAs\n", dfas.size()); + + vector dfa_ptrs; + dfa_ptrs.reserve(dfas.size()); + for (auto &d : dfas) { + dfa_ptrs.push_back(d.get()); + } + + auto merged = mergeAllDfas(dfa_ptrs, DFA_MERGE_MAX_STATES, &rm, cc.grey); + if (!merged) { + DEBUG_PRINTF("merge failed\n"); + return false; + } + + DEBUG_PRINTF("merge succeeded, result has %zu states\n", + merged->states.size()); + dfas.clear(); + dfas.push_back(std::move(merged)); + return true; +} + void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { // If the graph is poisoned (i.e. we can't build a SmallWrite version), // we don't even try. 
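The three entry points above (hs_compress_stream, hs_expand_stream and hs_reset_and_expand_stream) make up the new stream state compression API from the 4.6.0 changelog. A minimal usage sketch follows; it assumes a streaming-mode database db and a scratch region allocated elsewhere, omits match callbacks for brevity, and the wrapper function name is invented for illustration.

#include <vector>
#include "hs.h"

// Illustrative only: suspend a live stream into a caller-owned buffer and
// rebuild it later. db and scratch are assumed to exist already.
static void suspend_and_resume(const hs_database_t *db, hs_scratch_t *scratch) {
    hs_stream_t *stream = nullptr;
    if (hs_open_stream(db, 0, &stream) != HS_SUCCESS) {
        return;
    }

    // ... calls to hs_scan_stream(stream, ...) happen here ...

    // With no buffer, hs_compress_stream reports the space required and
    // returns HS_INSUFFICIENT_SPACE.
    size_t needed = 0;
    hs_compress_stream(stream, nullptr, 0, &needed);

    std::vector<char> buf(needed);
    size_t used = 0;
    if (hs_compress_stream(stream, buf.data(), buf.size(), &used) != HS_SUCCESS) {
        hs_close_stream(stream, nullptr, nullptr, nullptr);
        return;
    }

    // The original stream is unchanged; discard it without an EOD callback.
    hs_close_stream(stream, nullptr, nullptr, nullptr);

    // Later: recreate an equivalent stream from the compressed image.
    hs_stream_t *revived = nullptr;
    if (hs_expand_stream(db, &revived, buf.data(), used) == HS_SUCCESS) {
        // ... continue with hs_scan_stream(revived, ...) ...
        hs_close_stream(revived, scratch, nullptr, nullptr);
    }
}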
@@ -283,19 +315,14 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { minimize_hopcroft(*r, cc.grey); } - if (rdfa) { - // do a merge of the new dfa with the existing dfa - auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES, - &rm, cc.grey); - if (!merged) { - DEBUG_PRINTF("merge failed\n"); + dfas.push_back(std::move(r)); + + if (dfas.size() >= cc.grey.smallWriteMergeBatchSize) { + if (!mergeDfas(dfas, rm, cc)) { + dfas.clear(); poisoned = true; return; } - DEBUG_PRINTF("merge succeeded, built %p\n", merged.get()); - rdfa = move(merged); - } else { - rdfa = move(r); } } @@ -368,7 +395,7 @@ namespace { */ struct ACVisitor : public boost::default_bfs_visitor { ACVisitor(LitTrie &trie_in, - map &failure_map_in, + unordered_map &failure_map_in, vector &ordering_in) : mutable_trie(trie_in), failure_map(failure_map_in), ordering(ordering_in) {} @@ -418,7 +445,7 @@ struct ACVisitor : public boost::default_bfs_visitor { private: LitTrie &mutable_trie; //!< For setting reports property. - map &failure_map; + unordered_map &failure_map; vector &ordering; //!< BFS ordering for vertices. }; } @@ -444,11 +471,13 @@ bool isSaneTrie(const LitTrie &trie) { */ static void buildAutomaton(LitTrie &trie, - map &failure_map, + unordered_map &failure_map, vector &ordering) { assert(isSaneTrie(trie)); // Find our failure transitions and reports. + failure_map.reserve(num_vertices(trie)); + ordering.reserve(num_vertices(trie)); ACVisitor ac_vis(trie, failure_map, ordering); boost::breadth_first_search(trie, trie.root, visitor(ac_vis)); @@ -645,7 +674,7 @@ unique_ptr buildDfa(LitTrie &trie, bool nocase) { DEBUG_PRINTF("trie has %zu states\n", num_vertices(trie)); vector ordering; - map failure_map; + unordered_map failure_map; buildAutomaton(trie, failure_map, ordering); // Construct DFA states in BFS order. @@ -710,64 +739,6 @@ unique_ptr buildDfa(LitTrie &trie, bool nocase) { return rdfa; } -bool SmallWriteBuildImpl::determiniseLiterals() { - DEBUG_PRINTF("handling literals\n"); - assert(!poisoned); - assert(num_literals <= cc.grey.smallWriteMaxLiterals); - - if (is_empty(lit_trie) && is_empty(lit_trie_nocase)) { - DEBUG_PRINTF("no literals\n"); - return true; /* nothing to do */ - } - - vector> dfas; - - if (!is_empty(lit_trie)) { - dfas.push_back(buildDfa(lit_trie, false)); - DEBUG_PRINTF("caseful literal dfa with %zu states\n", - dfas.back()->states.size()); - } - if (!is_empty(lit_trie_nocase)) { - dfas.push_back(buildDfa(lit_trie_nocase, true)); - DEBUG_PRINTF("nocase literal dfa with %zu states\n", - dfas.back()->states.size()); - } - - if (rdfa) { - dfas.push_back(move(rdfa)); - DEBUG_PRINTF("general dfa with %zu states\n", - dfas.back()->states.size()); - } - - // If we only have one DFA, no merging is necessary. - if (dfas.size() == 1) { - DEBUG_PRINTF("only one dfa\n"); - rdfa = move(dfas.front()); - return true; - } - - // Merge all DFAs. - vector to_merge; - for (const auto &d : dfas) { - to_merge.push_back(d.get()); - } - - auto merged = mergeAllDfas(to_merge, DFA_MERGE_MAX_STATES, &rm, cc.grey); - - if (!merged) { - DEBUG_PRINTF("merge failed\n"); - poisoned = true; - return false; - } - - DEBUG_PRINTF("merge succeeded, built dfa with %zu states\n", - merged->states.size()); - - // Replace our only DFA with the merged one. 
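The hunk above replaces the old one-DFA-at-a-time merge in SmallWriteBuildImpl::add with batching: new raw_dfa objects are queued and only merged once grey.smallWriteMergeBatchSize of them have accumulated, one of the compile-time improvements mentioned in the changelog. A generic sketch of the pattern, with hypothetical names, is:

#include <utility>
#include <vector>

// Sketch only (names are placeholders). Merging once per batch of B inputs
// costs roughly N/B merge operations for N patterns, instead of one
// increasingly expensive merge per pattern.
template<class DFA, class MergeAllFn>
bool add_with_batching(std::vector<DFA> &pending, DFA d, size_t batch_size,
                       MergeAllFn merge_all) {
    pending.push_back(std::move(d));
    if (pending.size() < batch_size) {
        return true;              // defer the expensive work
    }
    DFA merged;
    if (!merge_all(pending, &merged)) {
        return false;             // caller poisons the small write engine
    }
    pending.clear();
    pending.push_back(std::move(merged));
    return true;
}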
- rdfa = move(merged); - return true; -} - #define MAX_GOOD_ACCEL_DEPTH 4 static @@ -890,8 +861,8 @@ unique_ptr makeSmallWriteBuilder(size_t num_patterns, bytecode_ptr SmallWriteBuildImpl::build(u32 roseQuality) { const bool has_literals = !is_empty(lit_trie) || !is_empty(lit_trie_nocase); - const bool has_non_literals = rdfa != nullptr; - if (!rdfa && !has_literals) { + const bool has_non_literals = !dfas.empty(); + if (dfas.empty() && !has_literals) { DEBUG_PRINTF("no smallwrite engine\n"); poisoned = true; return nullptr; @@ -914,16 +885,31 @@ bytecode_ptr SmallWriteBuildImpl::build(u32 roseQuality) { } } - if (!determiniseLiterals()) { - DEBUG_PRINTF("some literal could not be made into a smallwrite dfa\n"); - return nullptr; + if (!is_empty(lit_trie)) { + dfas.push_back(buildDfa(lit_trie, false)); + DEBUG_PRINTF("caseful literal dfa with %zu states\n", + dfas.back()->states.size()); + } + if (!is_empty(lit_trie_nocase)) { + dfas.push_back(buildDfa(lit_trie_nocase, true)); + DEBUG_PRINTF("nocase literal dfa with %zu states\n", + dfas.back()->states.size()); } - if (!rdfa) { + if (dfas.empty()) { DEBUG_PRINTF("no dfa, pruned everything away\n"); return nullptr; } + if (!mergeDfas(dfas, rm, cc)) { + dfas.clear(); + return nullptr; + } + + assert(dfas.size() == 1); + auto rdfa = std::move(dfas.front()); + dfas.clear(); + DEBUG_PRINTF("building rdfa %p\n", rdfa.get()); u32 start_offset; @@ -957,7 +943,8 @@ set SmallWriteBuildImpl::all_reports() const { if (poisoned) { return reports; } - if (rdfa) { + + for (const auto &rdfa : dfas) { insert(&reports, ::ue2::all_reports(*rdfa)); } diff --git a/src/smallwrite/smallwrite_dump.cpp b/src/smallwrite/smallwrite_dump.cpp index bdf55c30..b2c33ecf 100644 --- a/src/smallwrite/smallwrite_dump.cpp +++ b/src/smallwrite/smallwrite_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "nfa/nfa_build_util.h" #include "nfa/nfa_dump_api.h" #include "nfa/nfa_internal.h" +#include "util/dump_util.h" #include #include @@ -74,9 +75,8 @@ void smwrDumpNFA(const SmallWriteEngine *smwr, bool dump_raw, nfaGenerateDumpFiles(n, base + "smallwrite_nfa"); if (dump_raw) { - FILE *f = fopen((base + "smallwrite_nfa.raw").c_str(), "w"); + StdioFile f(base + "smallwrite_nfa.raw", "w"); fwrite(n, 1, n->length, f); - fclose(f); } } diff --git a/src/som/slot_manager.cpp b/src/som/slot_manager.cpp index 3dc74d3d..d97e8fc1 100644 --- a/src/som/slot_manager.cpp +++ b/src/som/slot_manager.cpp @@ -40,6 +40,7 @@ #include "nfagraph/ng_som_util.h" #include "nfagraph/ng_region.h" #include "util/charreach.h" +#include "util/hash.h" #include "util/make_unique.h" #include "util/dump_charclass.h" #include "util/verify_types.h" @@ -48,8 +49,6 @@ #include #include -#include - using namespace std; namespace ue2 { @@ -67,13 +66,8 @@ SlotCacheEntry::SlotCacheEntry(const NGHolder &prefix_in, size_t SlotEntryHasher::operator()(const SlotCacheEntry &e) const { assert(e.prefix); - using boost::hash_combine; - - size_t v = 0; - hash_combine(v, hash_holder(*e.prefix)); - hash_combine(v, e.parent_slot); - hash_combine(v, e.is_reset); - hash_combine(v, e.escapes.hash()); + size_t v = hash_all(hash_holder(*e.prefix), e.parent_slot, + e.is_reset, e.escapes); DEBUG_PRINTF("%zu vertices, parent_slot=%u, escapes=%s, is_reset=%d " "hashes to %zx\n", 
num_vertices(*e.prefix), e.parent_slot, @@ -143,7 +137,7 @@ u32 SomSlotManager::getSomSlot(const NGHolder &prefix, u32 SomSlotManager::getInitialResetSomSlot(const NGHolder &prefix, const NGHolder &g, - const ue2::unordered_map ®ion_map, + const unordered_map ®ion_map, u32 last_sent_region, bool *prefix_already_implemented) { DEBUG_PRINTF("getting initial reset; last sent region %u\n", last_sent_region); @@ -171,9 +165,9 @@ u32 SomSlotManager::getInitialResetSomSlot(const NGHolder &prefix, // Clone a copy of g (and its region map) that we will be able to store // later on. shared_ptr gg = make_shared(); - ue2::unordered_map orig_to_copy; + unordered_map orig_to_copy; cloneHolder(*gg, g, &orig_to_copy); - ue2::unordered_map gg_region_map; + unordered_map gg_region_map; for (const auto &m : region_map) { assert(contains(region_map, m.first)); gg_region_map.emplace(orig_to_copy.at(m.first), m.second); diff --git a/src/som/slot_manager.h b/src/som/slot_manager.h index ddb105f5..e5b2d794 100644 --- a/src/som/slot_manager.h +++ b/src/som/slot_manager.h @@ -38,10 +38,10 @@ #include "nfagraph/ng_holder.h" #include "util/bytecode_ptr.h" #include "util/noncopyable.h" -#include "util/ue2_containers.h" #include #include +#include struct NFA; @@ -69,7 +69,7 @@ public: /** prefix must be acting as a resetting sentinel and should be a dag (if * not how are we establish som?) */ u32 getInitialResetSomSlot(const NGHolder &prefix, const NGHolder &g, - const ue2::unordered_map ®ion_map, + const std::unordered_map ®ion_map, u32 last_sent_region, bool *prefix_already_implemented); diff --git a/src/som/slot_manager_dump.cpp b/src/som/slot_manager_dump.cpp index 484d6c14..4ed5cef0 100644 --- a/src/som/slot_manager_dump.cpp +++ b/src/som/slot_manager_dump.cpp @@ -36,10 +36,11 @@ #include "nfagraph/ng_dump.h" #include "nfagraph/ng_is_equal.h" #include "util/container.h" +#include "util/dump_util.h" #include "ue2common.h" -#include #include +#include #include #ifndef DUMP_SUPPORT @@ -55,7 +56,6 @@ void dumpSomSlotManager(const SomSlotManager &ssm, const Grey &grey) { return; } - string filename = grey.dumpPath + "/ssm.txt"; map by_slot; map by_slot_ir; @@ -67,7 +67,7 @@ void dumpSomSlotManager(const SomSlotManager &ssm, const Grey &grey) { by_slot_ir[e.slot] = &e; } - FILE *f = fopen(filename.c_str(), "w"); + StdioFile f(grey.dumpPath + "/ssm.txt", "w"); fprintf(f, "slot width %u bytes\n\n", ssm.precision); @@ -94,8 +94,6 @@ void dumpSomSlotManager(const SomSlotManager &ssm, const Grey &grey) { } } - fclose(f); - for (const auto &h : ssm.cache->initial_prefixes) { dumpHolder(*h, hash_holder(*h), "ssm_prefix", grey); } diff --git a/src/som/slot_manager_internal.h b/src/som/slot_manager_internal.h index 46bfbe83..7e1fecc7 100644 --- a/src/som/slot_manager_internal.h +++ b/src/som/slot_manager_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,10 +32,11 @@ #include "nfagraph/ng.h" #include "nfagraph/ng_is_equal.h" #include "util/charreach.h" -#include "util/ue2_containers.h" #include "ue2common.h" #include +#include +#include #include namespace ue2 { @@ -43,14 +44,14 @@ namespace ue2 { struct InitialResetEntry { InitialResetEntry(std::shared_ptr sent_in, std::shared_ptr body_in, - const ue2::unordered_map &body_regions_in, + const std::unordered_map &body_regions_in, u32 sent_region_in, u32 
first_bad_region_in) : sent(sent_in), body(body_in), body_regions(body_regions_in), sent_region(sent_region_in), first_bad_region(first_bad_region_in) {} std::shared_ptr sent; std::shared_ptr body; - ue2::unordered_map body_regions; + std::unordered_map body_regions; u32 sent_region; u32 first_bad_region; /* ~0U if it must cover the whole g */ }; @@ -85,7 +86,7 @@ struct SlotEntryEqual { }; struct SlotCache { - typedef ue2::unordered_set CacheStore; void insert(const NGHolder &prefix, const CharReach &escapes, @@ -96,8 +97,8 @@ struct SlotCache { CacheStore store; - ue2::unordered_set, NGHolderHasher, - NGHolderEqual> initial_prefixes; + std::unordered_set, NGHolderHasher, + NGHolderEqual> initial_prefixes; std::vector initial_resets; }; diff --git a/src/stream_compress.c b/src/stream_compress.c new file mode 100644 index 00000000..0cc782da --- /dev/null +++ b/src/stream_compress.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
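The SlotCache containers above are keyed on graphs rather than on pointer values, so they carry custom hasher and equality functors (SlotEntryHasher/SlotEntryEqual, NGHolderHasher/NGHolderEqual). A simplified sketch of that pattern, with placeholder functor names and bodies that only approximate the real helpers, is:

#include <memory>
#include <unordered_set>

// Placeholder functors: hash and compare the pointed-to graph, not the
// pointer, so structurally identical prefixes share a single cache entry.
struct GraphHash {
    size_t operator()(const std::shared_ptr<const NGHolder> &g) const {
        return hash_holder(*g);          // structural hash of the graph
    }
};
struct GraphEqual {
    bool operator()(const std::shared_ptr<const NGHolder> &a,
                    const std::shared_ptr<const NGHolder> &b) const {
        return is_equal(*a, *b);         // structural comparison
    }
};

using PrefixSet = std::unordered_set<std::shared_ptr<const NGHolder>,
                                     GraphHash, GraphEqual>;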
+ */ + +#include "stream_compress.h" + +#include "state.h" +#include "nfa/nfa_internal.h" +#include "rose/rose_internal.h" +#include "util/multibit.h" +#include "util/multibit_compress.h" +#include "util/uniform_ops.h" + +#include + +#define COPY_IN(p, sz) do { \ + assert(currOffset + sz <= buf_size); \ + memcpy(buf + currOffset, p, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_OUT(p, sz) do { \ + if (currOffset + sz > buf_size) { \ + return 0; \ + } \ + memcpy(p, buf + currOffset, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define SIZE_COPY_IN(p, sz) do { \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_IN(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_compress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_OUT(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_decompress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_SIZE(p, total_bits) do { \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + size_t sz = mmbit_compsize(bits, total_bits); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY COPY_OUT +#define COPY_MULTIBIT COPY_MULTIBIT_OUT +#define ASSIGN(lhs, rhs) do { lhs = rhs; } while (0) +#define FN_SUFFIX expand +#define STREAM_QUAL +#define BUF_QUAL const +#include "stream_compress_impl.h" + +int expand_stream(struct hs_stream *stream, const struct RoseEngine *rose, + const char *buf, size_t buf_size) { + return sc_expand(rose, stream, buf, buf_size); +} + +#define COPY COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_IN +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX compress +#define STREAM_QUAL const +#define BUF_QUAL +#include "stream_compress_impl.h" + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_compress(rose, stream, buf, buf_size); +} + +#define COPY SIZE_COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_SIZE +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX size +#define STREAM_QUAL const +#define BUF_QUAL UNUSED +#include "stream_compress_impl.h" + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_size(rose, stream, NULL, 0); +} diff --git a/src/stream_compress.h b/src/stream_compress.h new file mode 100644 index 00000000..0d06d1e0 --- /dev/null +++ b/src/stream_compress.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
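stream_compress.c above generates its three state walkers (sc_expand, sc_compress, sc_size) by including stream_compress_impl.h three times with different definitions of COPY, COPY_MULTIBIT, ASSIGN and FN_SUFFIX. A stripped-down sketch of the same multiple-inclusion technique, with invented file and function names, looks like this:

/* walk_impl.h -- expects COPY and FN_SUFFIX to be defined by the includer. */
static size_t JOIN(walk_, FN_SUFFIX)(char *dst, const char *src, size_t n) {
    size_t off = 0;
    while (off < n) {
        COPY(dst + off, src + off, 1);   /* expands differently per inclusion */
        off++;
    }
    return off;
}
#undef COPY
#undef FN_SUFFIX

/* walk.c */
#include <stddef.h>
#include <string.h>
#define JOIN_(a, b) a##b
#define JOIN(a, b) JOIN_(a, b)

#define COPY(d, s, sz) memcpy((d), (s), (sz))
#define FN_SUFFIX copy
#include "walk_impl.h"                   /* emits walk_copy(): writes output */

#define COPY(d, s, sz) do { } while (0)
#define FN_SUFFIX size
#include "walk_impl.h"                   /* emits walk_size(): only measures */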
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for dynamic compress/expand for streams. + */ + +#ifndef STREAM_COMPRESS_H +#define STREAM_COMPRESS_H + +#include + +struct hs_stream; +struct RoseEngine; + +int expand_stream(struct hs_stream *out, const struct RoseEngine *rose, + const char *buf, size_t buf_size); + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *src); + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream); + +#endif diff --git a/src/stream_compress_impl.h b/src/stream_compress_impl.h new file mode 100644 index 00000000..54aebd71 --- /dev/null +++ b/src/stream_compress_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/join.h" + +#define COPY_FIELD(x) COPY(&x, sizeof(x)) +#define COPY_LEFTFIXES JOIN(sc_left_, FN_SUFFIX) +#define COPY_SOM_INFO JOIN(sc_som_, FN_SUFFIX) + +static +size_t COPY_LEFTFIXES(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + if (!rose->activeLeftIterOffset) { + return currOffset; + } + + const struct RoseStateOffsets *so = &rose->stateOffsets; + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + /* Note: in the expand case the active left array has already been copied + * into the stream. */ + const u8 *ara = (const u8 *)(stream_body + so->activeLeftArray); + const u32 arCount = rose->activeLeftCount; + const struct LeftNfaInfo *left_table = getLeftTable(rose); + + /* We only want to look at non-transient leftfixes */ + const struct mmbit_sparse_iter *it = getActiveLeftIter(rose); + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + u32 dummy; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &dummy, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &dummy, it, si_state)) { + u32 qi = ri + rose->leftfixBeginQueue; + UNUSED const struct LeftNfaInfo *left = left_table + ri; + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + /* copy the one whole byte for active leftfixes as well */ + assert(left->lagIndex != ROSE_OFFSET_INVALID); + COPY(stream_body + so->leftfixLagTable + left->lagIndex, 1); + } + + return currOffset; +} + +static +size_t COPY_SOM_INFO(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + const struct RoseStateOffsets *so = &rose->stateOffsets; + + if (!so->somLocation) { + assert(!so->somValid); + assert(!so->somWritable); + return currOffset; + } + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + assert(so->somValid); + assert(so->somWritable); + + COPY_MULTIBIT(stream_body + so->somWritable, rose->somLocationCount); + COPY_MULTIBIT(stream_body + so->somValid, rose->somLocationCount); + + /* Copy only the som slots which contain valid values. */ + /* Note: in the expand case the som valid array has been copied in. 
*/ + const u8 *svalid = (const u8 *)(stream_body + so->somValid); + u32 s_count = rose->somLocationCount; + u32 s_width = rose->somHorizon; + for (u32 slot = mmbit_iterate(svalid, s_count, MMB_INVALID); + slot != MMB_INVALID; slot = mmbit_iterate(svalid, s_count, slot)) { + COPY(stream_body + so->somLocation + slot * s_width, s_width); + } + + return currOffset; +} + +static +size_t JOIN(sc_, FN_SUFFIX)(const struct RoseEngine *rose, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + size_t currOffset = 0; + const struct RoseStateOffsets *so = &rose->stateOffsets; + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + COPY_FIELD(stream->offset); + ASSIGN(stream->rose, rose); + + COPY(stream_body + ROSE_STATE_OFFSET_STATUS_FLAGS, 1); + COPY_MULTIBIT(stream_body + ROSE_STATE_OFFSET_ROLE_MMBIT, rose->rolesWithStateCount); + + /* stream is valid in compress/size, and stream->offset has been set already + * on the expand side */ + u64a offset = stream->offset; + u32 history = MIN((u32)offset, rose->historyRequired); + + /* copy the active mmbits */ + COPY_MULTIBIT(stream_body + so->activeLeafArray, rose->activeArrayCount); + COPY_MULTIBIT(stream_body + so->activeLeftArray, rose->activeLeftCount); + + COPY(stream_body + so->longLitState, so->longLitState_size); + + /* Leftlag table will be handled later, for active leftfixes */ + + /* anchored table state is not required once we are deep in the stream */ + if (offset <= rose->anchoredDistance) { + COPY(stream_body + so->anchorState, rose->anchorStateSize); + } + + COPY(stream_body + so->groups, so->groups_size); + + /* copy the real bits of history */ + UNUSED u32 hend = so->history + rose->historyRequired; + COPY(stream_body + hend - history, history); + + /* copy the exhaustion multibit */ + COPY_MULTIBIT(stream_body + so->exhausted, rose->ekeyCount); + + /* copy nfa stream state for endfixes */ + /* Note: in the expand case the active array has already been copied into + * the stream. 
*/ + const u8 *aa = (const u8 *)(stream_body + so->activeLeafArray); + u32 aaCount = rose->activeArrayCount; + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("saving stream state for qi=%u\n", qi); + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + } + + /* copy nfa stream state for leftfixes */ + currOffset = COPY_LEFTFIXES(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + currOffset = COPY_SOM_INFO(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + return currOffset; +} + +#undef ASSIGN +#undef COPY +#undef COPY_FIELD +#undef COPT_LEFTFIXES +#undef COPY_MULTIBIT +#undef COPY_SOM_INFO +#undef FN_SUFFIX +#undef BUF_QUAL +#undef STREAM_QUAL diff --git a/src/util/accel_scheme.h b/src/util/accel_scheme.h index f524fe93..2a067b30 100644 --- a/src/util/accel_scheme.h +++ b/src/util/accel_scheme.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,7 +30,7 @@ #define ACCEL_SCHEME_H #include "util/charreach.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include @@ -39,7 +39,7 @@ namespace ue2 { #define MAX_ACCEL_DEPTH 4 struct AccelScheme { - flat_set > double_byte; + flat_set> double_byte; CharReach cr = CharReach::dot(); CharReach double_cr; u32 offset = MAX_ACCEL_DEPTH + 1; diff --git a/src/util/bitfield.h b/src/util/bitfield.h index a71c1f88..24c0c580 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,12 +36,12 @@ #include "ue2common.h" #include "popcount.h" #include "util/bitutils.h" +#include "util/hash.h" #include #include #include -#include namespace ue2 { @@ -373,7 +373,7 @@ public: /// Simple hash. size_t hash() const { - return boost::hash_range(std::begin(bits), std::end(bits)); + return ue2_hasher()(bits); } /// Sentinel value meaning "no more bits", used by find_first and @@ -420,12 +420,17 @@ private: std::array bits; }; -/** \brief Boost-style hash free function. */ -template -size_t hash_value(const bitfield &b) { - return b.hash(); -} - } // namespace ue2 +namespace std { + +template +struct hash> { + size_t operator()(const ue2::bitfield &b) const { + return b.hash(); + } +}; + +} // namespace std + #endif // BITFIELD_H diff --git a/src/util/charreach.h b/src/util/charreach.h index 53f2a5d2..f6d3a2af 100644 --- a/src/util/charreach.h +++ b/src/util/charreach.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -196,12 +196,17 @@ bool isSubsetOf(const CharReach &small, const CharReach &big); bool isutf8ascii(const CharReach &cr); bool isutf8start(const CharReach &cr); -/** \brief Boost-style hash free function. 
*/ -static really_inline -size_t hash_value(const CharReach &cr) { - return cr.hash(); -} - } // namespace ue2 +namespace std { + +template<> +struct hash { + size_t operator()(const ue2::CharReach &cr) const { + return cr.hash(); + } +}; + +} // namespace std + #endif // NG_CHARREACH_H diff --git a/src/util/clique.cpp b/src/util/clique.cpp index 79f06932..c2befea4 100644 --- a/src/util/clique.cpp +++ b/src/util/clique.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ #include "container.h" #include "graph_range.h" #include "make_unique.h" -#include "ue2_containers.h" #include #include diff --git a/src/util/depth.h b/src/util/depth.h index 9af1ded8..5305c6f1 100644 --- a/src/util/depth.h +++ b/src/util/depth.h @@ -221,8 +221,8 @@ public: std::string str() const; #endif - friend size_t hash_value(const depth &d) { - return d.val; + size_t hash() const { + return val; } private: @@ -260,10 +260,6 @@ struct DepthMinMax : totally_ordered { }; -inline size_t hash_value(const DepthMinMax &d) { - return hash_all(d.min, d.max); -} - /** * \brief Merge two DepthMinMax values together to produce their union. */ @@ -271,4 +267,22 @@ DepthMinMax unionDepthMinMax(const DepthMinMax &a, const DepthMinMax &b); } // namespace ue2 +namespace std { + +template<> +struct hash { + size_t operator()(const ue2::depth &d) const { + return d.hash(); + } +}; + +template<> +struct hash { + size_t operator()(const ue2::DepthMinMax &d) const { + return hash_all(d.min, d.max); + } +}; + +} // namespace + #endif // DEPTH_H diff --git a/src/util/determinise.h b/src/util/determinise.h index d7bb592b..102a1974 100644 --- a/src/util/determinise.h +++ b/src/util/determinise.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,14 +38,13 @@ #include "container.h" #include "ue2common.h" -#include #include +#include +#include #include namespace ue2 { -#define DETERMINISE_RESERVE_SIZE 10 - /* Automaton details: * * const vector initial() @@ -73,42 +72,43 @@ namespace ue2 { * \param state_limit limit on the number of dfa states to construct * \param statesets_out a mapping from DFA state to the set of NFA states in * the automaton - * \return zero on success + * \return true on success, false if state limit exceeded */ template never_inline -int determinise(Auto &n, std::vector &dstates_out, dstate_id_t state_limit, +bool determinise(Auto &n, std::vector &dstates, size_t state_limit, std::vector *statesets_out = nullptr) { DEBUG_PRINTF("the determinator\n"); - typedef typename Auto::StateSet StateSet; - typedef typename Auto::StateMap DstateIdMap; - DstateIdMap dstate_ids; - std::vector statesets; + using StateSet = typename Auto::StateSet; + typename Auto::StateMap dstate_ids; const size_t alphabet_size = n.alphasize; - std::vector dstates; - dstates.reserve(DETERMINISE_RESERVE_SIZE); - statesets.reserve(DETERMINISE_RESERVE_SIZE); + dstates.clear(); + dstates.reserve(state_limit); - dstate_ids[n.dead] = DEAD_STATE; + dstate_ids.emplace(n.dead, DEAD_STATE); dstates.push_back(ds(alphabet_size)); std::fill_n(dstates[0].next.begin(), alphabet_size, DEAD_STATE); - statesets.push_back(n.dead); + 
std::queue> q; + q.emplace(n.dead, DEAD_STATE); const std::vector &init = n.initial(); for (u32 i = 0; i < init.size(); i++) { - statesets.push_back(init[i]); + q.emplace(init[i], dstates.size()); assert(!contains(dstate_ids, init[i])); - dstate_ids[init[i]] = dstates.size(); + dstate_ids.emplace(init[i], dstates.size()); dstates.push_back(ds(alphabet_size)); } std::vector succs(alphabet_size, n.dead); - for (dstate_id_t curr_id = DEAD_STATE; curr_id < dstates.size(); - curr_id++) { - StateSet &curr = statesets[curr_id]; + + while (!q.empty()) { + auto m = std::move(q.front()); + q.pop(); + StateSet &curr = m.first; + dstate_id_t curr_id = m.second; DEBUG_PRINTF("curr: %hu\n", curr_id); @@ -139,43 +139,48 @@ int determinise(Auto &n, std::vector &dstates_out, dstate_id_t state_limit, if (s && succs[s] == succs[s - 1]) { succ_id = dstates[curr_id].next[s - 1]; } else { - typename DstateIdMap::const_iterator dstate_id_iter; - dstate_id_iter = dstate_ids.find(succs[s]); - - if (dstate_id_iter != dstate_ids.end()) { - succ_id = dstate_id_iter->second; - + auto p = dstate_ids.find(succs[s]); + if (p != dstate_ids.end()) { // succ[s] is already present + succ_id = p->second; if (succ_id > curr_id && !dstates[succ_id].daddy && n.unalpha[s] < N_CHARS) { dstates[succ_id].daddy = curr_id; } } else { - statesets.push_back(succs[s]); - succ_id = dstates.size(); - dstate_ids[succs[s]] = succ_id; + succ_id = dstate_ids.size(); + dstate_ids.emplace(succs[s], succ_id); dstates.push_back(ds(alphabet_size)); dstates.back().daddy = n.unalpha[s] < N_CHARS ? curr_id : 0; + q.emplace(succs[s], succ_id); } DEBUG_PRINTF("-->%hu on %02hx\n", succ_id, n.unalpha[s]); } if (succ_id >= state_limit) { - DEBUG_PRINTF("succ_id %hu >= state_limit %hu\n", + DEBUG_PRINTF("succ_id %hu >= state_limit %zu\n", succ_id, state_limit); - return -2; + dstates.clear(); + return false; } dstates[curr_id].next[s] = succ_id; } } - dstates_out = dstates; + // The dstates vector will persist in the raw_dfa. + dstates.shrink_to_fit(); + if (statesets_out) { - statesets_out->swap(statesets); + auto &statesets = *statesets_out; + statesets.resize(dstate_ids.size()); + for (auto &m : dstate_ids) { + statesets[m.second] = std::move(m.first); + } } + DEBUG_PRINTF("ok\n"); - return 0; + return true; } static inline diff --git a/src/util/dump_util.h b/src/util/dump_util.h index f5ebe94a..dc352c28 100644 --- a/src/util/dump_util.h +++ b/src/util/dump_util.h @@ -29,7 +29,11 @@ #ifndef DUMP_UTIL #define DUMP_UTIL +#include "noncopyable.h" + #include +#include +#include namespace ue2 { @@ -38,6 +42,22 @@ namespace ue2 { */ FILE *fopen_or_throw(const char *path, const char *mode); +/** + * \brief Helper class: wraps C stdio FILE* handle and takes care of closing + * the file on destruction. + */ +class StdioFile : noncopyable { +public: + StdioFile(const std::string &filename, const char *mode) + : handle(fopen_or_throw(filename.c_str(), mode), &fclose) {} + + // Implicit conversion to FILE* for use by stdio calls. + operator FILE *() { return handle.get(); } + +private: + std::unique_ptr handle; +}; + } // namespace ue2 #endif diff --git a/src/util/ue2_containers.h b/src/util/flat_containers.h similarity index 96% rename from src/util/ue2_containers.h rename to src/util/flat_containers.h index d345a4fa..41452eb4 100644 --- a/src/util/ue2_containers.h +++ b/src/util/flat_containers.h @@ -26,10 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. 
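The StdioFile wrapper added to dump_util.h above owns the FILE* and closes it on destruction, which is why the dump code in this patch can drop its explicit fclose() calls. A small hypothetical helper (path and contents invented) shows the intent:

#include <cstdio>
#include <string>
#include "util/dump_util.h"

// Illustrative only: no fclose() is needed on any exit path.
static void dump_numbers(const std::string &path, int n) {
    ue2::StdioFile f(path, "w");     // throws if the file cannot be opened
    if (n <= 0) {
        return;                      // handle released automatically
    }
    for (int i = 0; i < n; i++) {
        fprintf(f, "%d\n", i);       // implicit conversion to FILE *
    }
}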
*/ -#ifndef UTIL_UE2_CONTAINERS_H_ -#define UTIL_UE2_CONTAINERS_H_ +#ifndef UTIL_FLAT_CONTAINERS_H +#define UTIL_FLAT_CONTAINERS_H #include "ue2common.h" +#include "util/hash.h" #include "util/operators.h" #include "util/small_vector.h" @@ -38,19 +39,10 @@ #include #include -#include #include -#include -#include namespace ue2 { -/** \brief Unordered set container implemented internally as a hash table. */ -using boost::unordered_set; - -/** \brief Unordered map container implemented internally as a hash table. */ -using boost::unordered_map; - namespace flat_detail { // Iterator facade that wraps an underlying iterator, so that we get our @@ -363,11 +355,6 @@ public: friend void swap(flat_set &a, flat_set &b) { a.swap(b); } - - // Free hash function. - friend size_t hash_value(const flat_set &a) { - return boost::hash_range(a.begin(), a.end()); - } }; /** @@ -652,13 +639,26 @@ public: friend void swap(flat_map &a, flat_map &b) { a.swap(b); } +}; - // Free hash function. - friend size_t hash_value(const flat_map &a) { - return boost::hash_range(a.begin(), a.end()); +} // namespace ue2 + +namespace std { + +template +struct hash> { + size_t operator()(const ue2::flat_set &f) { + return ue2::ue2_hasher()(f); } }; -} // namespace +template +struct hash> { + size_t operator()(const ue2::flat_map &f) { + return ue2::ue2_hasher()(f); + } +}; -#endif // UTIL_UE2_CONTAINERS_H_ +} // namespace std + +#endif // UTIL_FLAT_CONTAINERS_H diff --git a/src/util/graph.h b/src/util/graph.h index 4c2876f1..660afd02 100644 --- a/src/util/graph.h +++ b/src/util/graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,8 +35,9 @@ #include "container.h" #include "ue2common.h" +#include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/ue2_containers.h" +#include "util/unordered.h" #include #include @@ -115,7 +116,7 @@ bool has_proper_successor(const typename Graph::vertex_descriptor &v, template void find_reachable(const Graph &g, const SourceCont &sources, OutCont *out) { using vertex_descriptor = typename Graph::vertex_descriptor; - ue2::unordered_map colours; + std::unordered_map colours; for (auto v : sources) { boost::depth_first_visit(g, v, @@ -133,7 +134,7 @@ void find_reachable(const Graph &g, const SourceCont &sources, OutCont *out) { template void find_unreachable(const Graph &g, const SourceCont &sources, OutCont *out) { using vertex_descriptor = typename Graph::vertex_descriptor; - ue2::unordered_set reachable; + std::unordered_set reachable; find_reachable(g, sources, &reachable); @@ -145,7 +146,7 @@ void find_unreachable(const Graph &g, const SourceCont &sources, OutCont *out) { } template -ue2::flat_set +flat_set find_vertices_in_cycles(const Graph &g) { using vertex_descriptor = typename Graph::vertex_descriptor; @@ -159,7 +160,7 @@ find_vertices_in_cycles(const Graph &g) { comps[e.second].push_back(e.first); } - ue2::flat_set rv; + flat_set rv; for (const auto &comp : comps | boost::adaptors::map_values) { /* every vertex in a strongly connected component is reachable from @@ -182,7 +183,8 @@ find_vertices_in_cycles(const Graph &g) { template bool has_parallel_edge(const Graph &g) { using vertex_descriptor = typename Graph::vertex_descriptor; - ue2::unordered_set> seen; + ue2_unordered_set> seen; + for (const auto &e : edges_range(g)) { auto u = source(e, g); 
auto v = target(e, g); @@ -235,6 +237,29 @@ vertex_recorder make_vertex_recorder(Cont &o) { return vertex_recorder(o); } +/** + * \brief A vertex recorder visitor that sets the bits in the given bitset + * type (e.g. boost::dynamic_bitset) corresponding to the indices of the + * vertices encountered. + */ +template +class vertex_index_bitset_recorder : public boost::default_dfs_visitor { +public: + explicit vertex_index_bitset_recorder(Bitset &o) : out(o) {} + template + void discover_vertex(typename Graph::vertex_descriptor v, const Graph &g) { + assert(g[v].index < out.size()); + out.set(g[v].index); + } + Bitset &out; +}; + +template +vertex_index_bitset_recorder +make_vertex_index_bitset_recorder(Bitset &o) { + return vertex_index_bitset_recorder(o); +} + template std::pair add_edge_if_not_present(typename Graph::vertex_descriptor u, diff --git a/src/util/graph_small_color_map.h b/src/util/graph_small_color_map.h new file mode 100644 index 00000000..03e61cf4 --- /dev/null +++ b/src/util/graph_small_color_map.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \brief Small Color Map: implements a property map designed to represent + * colors using minimal memory (two bits per index). + * + * This is based on the Boost BGL two_bit_color_map, but provides some extra + * functionality (such as a fill operation). + */ + +#ifndef GRAPH_SMALL_COLOR_MAP_H +#define GRAPH_SMALL_COLOR_MAP_H + +#include "ue2common.h" + +#include +#include +#include + +namespace ue2 { + +enum class small_color : u8 { + white = 0, + gray = 1, + black = 2 + // Note: we have room for one more colour. +}; + +} // namespace ue2 + +namespace boost { + +/** \brief Specialisation of boost::color_traits for small_color. 
*/ +template<> +struct color_traits { + static ue2::small_color white() { return ue2::small_color::white; } + static ue2::small_color gray() { return ue2::small_color::gray; } + static ue2::small_color black() { return ue2::small_color::black; } +}; + +} // namespace boost + +namespace ue2 { + +static constexpr u8 fill_lut[] = { + 0, // white + 0x55, // gray + 0xaa, // black +}; + +/** + * \brief Small Color Map: implements a property map designed to represent + * colors using minimal memory (two bits per index). + * + * If your graph type provides an index map in get(vertex_index, g), you can + * use make_small_color_map() to construct this. + */ +template +class small_color_map { + size_t n; + IndexMap index_map; + + // This class is passed by value into (potentially recursive) BGL + // algorithms, so we use a shared_ptr to keep the copy lightweight and + // ensure that data is correctly destroyed. + std::shared_ptr> data; + + static constexpr size_t bit_size = 2; + static constexpr size_t entries_per_byte = (sizeof(u8) * 8) / bit_size; + static constexpr u8 bit_mask = (1U << bit_size) - 1; + +public: + using key_type = typename boost::property_traits::key_type; + using value_type = small_color; + using reference = small_color; + using category = boost::read_write_property_map_tag; + + small_color_map(size_t n_in, const IndexMap &index_map_in) + : n(n_in), index_map(index_map_in) { + size_t num_bytes = (n + entries_per_byte - 1) / entries_per_byte; + data = std::make_shared>(num_bytes); + fill(small_color::white); + } + + void fill(small_color color) { + assert(static_cast(color) < sizeof(fill_lut)); + u8 val = fill_lut[static_cast(color)]; + std::memset(data->data(), val, data->size()); + } + + small_color get_impl(key_type key) const { + auto i = get(index_map, key); + assert(i < n); + size_t byte = i / entries_per_byte; + assert(byte < data->size()); + size_t bit = (i % entries_per_byte) * bit_size; + u8 val = ((*data)[byte] >> bit) & bit_mask; + return static_cast(val); + } + + void put_impl(key_type key, small_color color) { + auto i = get(index_map, key); + assert(i < n); + size_t byte = i / entries_per_byte; + assert(byte < data->size()); + size_t bit = (i % entries_per_byte) * bit_size; + auto &block = (*data)[byte]; + u8 val = static_cast(color); + block = (block & ~(bit_mask << bit)) | (val << bit); + } +}; + +template +small_color get(const small_color_map &color_map, + typename boost::property_traits::key_type key) { + return color_map.get_impl(key); +} + +template +void put(small_color_map &color_map, + typename boost::property_traits::key_type key, + small_color val) { + color_map.put_impl(key, val); +} + +template +auto make_small_color_map(const Graph &g) + -> small_color_map { + return small_color_map( + num_vertices(g), get(vertex_index, g)); +} + +} // namespace ue2 + +#endif // GRAPH_SMALL_COLOR_MAP_H diff --git a/src/util/hash.h b/src/util/hash.h index 6f76e43d..60bc670a 100644 --- a/src/util/hash.h +++ b/src/util/hash.h @@ -34,16 +34,133 @@ #ifndef UTIL_HASH_H #define UTIL_HASH_H -#include -#include +#include +#include +#include +#include namespace ue2 { namespace hash_detail { +inline +void hash_combine_impl(size_t &seed, size_t value) { + // Note: constants explicitly truncated on 32-bit platforms. + const size_t a = (size_t)0x0b4e0ef37bc32127ULL; + const size_t b = (size_t)0x318f07b0c8eb9be9ULL; + seed ^= value * a; + seed += b; +} + +/** \brief Helper that determines whether std::begin() exists for T. 
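small_color_map above is a drop-in BGL color map that stores two bits per vertex index instead of a byte or more per vertex. As its comment notes, make_small_color_map() builds one from any graph that exposes get(vertex_index, g); a hypothetical helper (not from the patch) that feeds it to a BGL traversal could look like:

#include <boost/graph/depth_first_search.hpp>
#include "util/graph_small_color_map.h"

// Illustrative sketch: run a depth-first visit using the compact color map
// rather than an associative map of default-sized colors.
template<class Graph, class Visitor>
void dfs_with_small_colors(const Graph &g,
                           typename boost::graph_traits<Graph>::vertex_descriptor start,
                           Visitor vis) {
    auto colors = ue2::make_small_color_map(g);   // two bits per vertex
    boost::depth_first_visit(g, start, vis, colors);
}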
*/ +template +struct is_container_check { +private: + template + static auto has_begin_function(const C &obj) -> decltype(std::begin(obj)) { + return std::begin(obj); + } + static void has_begin_function(...) { + return; + } + using has_begin_type = decltype(has_begin_function(std::declval())); + +public: + static const bool value = !std::is_void::value; +}; + +/** \brief Type trait to enable on whether T is a container. */ +template +struct is_container + : public ::std::integral_constant::value> {}; + +/** \brief Helper that determines whether T::hash() exists. */ +template +struct has_hash_member_check { +private: + template + static auto has_hash_member_function(const C &obj) -> decltype(obj.hash()) { + return obj.hash(); + } + static void has_hash_member_function(...) { + return; + } + using has_hash = decltype(has_hash_member_function(std::declval())); + +public: + static const bool value = !std::is_void::value; +}; + +/** \brief Type trait to enable on whether T::hash() exists. */ +template +struct has_hash_member + : public ::std::integral_constant::value> {}; + +/** \brief Default hash: falls back to std::hash. */ +template +struct ue2_hash { + using decayed_type = typename std::decay::type; + size_t operator()(const T &obj) const { + return std::hash()(obj); + } +}; + +/** \brief Hash for std::pair. */ +template +struct ue2_hash, void> { + size_t operator()(const std::pair &p) const { + size_t v = 0; + hash_combine_impl(v, ue2_hash()(p.first)); + hash_combine_impl(v, ue2_hash()(p.second)); + return v; + } +}; + +/** \brief Hash for any type that has a hash() member function. */ +template +struct ue2_hash::value>::type> { + size_t operator()(const T &obj) const { + return obj.hash(); + } +}; + +/** + * \brief Hash for any container type that supports std::begin(). + * + * We exempt std::string as std::hash is provided and quicker. + */ +template +struct ue2_hash::value && + !std::is_same::type, std::string>::value && + !has_hash_member::value>::type> { + size_t operator()(const T &obj) const { + size_t v = 0; + for (const auto &elem : obj) { + using element_type = typename std::decay::type; + hash_combine_impl(v, ue2_hash()(elem)); + } + return v; + } +}; + +/** \brief Hash for enum types. */ +template +struct ue2_hash::value>::type> { + size_t operator()(const T &obj) const { + using utype = typename std::underlying_type::type; + return ue2_hash()(static_cast(obj)); + } +}; + +template +void hash_combine(size_t &seed, const T &obj) { + hash_combine_impl(seed, ue2_hash()(obj)); +} + template void hash_build(size_t &v, const T &obj) { - boost::hash_combine(v, obj); + hash_combine(v, obj); } template @@ -54,6 +171,21 @@ void hash_build(size_t &v, const T &obj, Args&&... args) { } // namespace hash_detail +using hash_detail::hash_combine; + +/** + * \brief Hasher for general use. + * + * Provides operators for most standard containers and falls back to + * std::hash. + */ +struct ue2_hasher { + template + size_t operator()(const T &obj) const { + return hash_detail::ue2_hash()(obj); + } +}; + /** * \brief Computes the combined hash of all its arguments. * @@ -70,15 +202,6 @@ size_t hash_all(Args&&... args) { return v; } -/** - * \brief Compute the hash of all the elements of any range on which we can - * call std::begin() and std::end(). 
- */ -template -size_t hash_range(const Range &r) { - return boost::hash_range(std::begin(r), std::end(r)); -} - } // namespace ue2 #endif // UTIL_HASH_H diff --git a/src/util/hash_dynamic_bitset.h b/src/util/hash_dynamic_bitset.h index 315aed34..65bc29c3 100644 --- a/src/util/hash_dynamic_bitset.h +++ b/src/util/hash_dynamic_bitset.h @@ -34,8 +34,9 @@ #ifndef UTIL_HASH_DYNAMIC_BITSET_H #define UTIL_HASH_DYNAMIC_BITSET_H +#include "hash.h" + #include -#include #include @@ -68,7 +69,7 @@ struct hash_output_it { template void operator=(const T &val) const { - boost::hash_combine(*out, val); + hash_combine(*out, val); } private: diff --git a/src/util/insertion_ordered.h b/src/util/insertion_ordered.h new file mode 100644 index 00000000..2067d350 --- /dev/null +++ b/src/util/insertion_ordered.h @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_INSERTION_ORDERED_H +#define UTIL_INSERTION_ORDERED_H + +/** + * \file + * \brief Insertion-ordered associative containers (set, map). + */ + +#include "util/operators.h" +#include "util/unordered.h" + +#include +#include +#include +#include +#include + +#include + +namespace ue2 { + +namespace insertion_ordered_detail { + +// Iterator facade that wraps an underlying iterator, so that we get our +// own iterator types. +template +class iter_wrapper + : public boost::iterator_facade, Value, + boost::random_access_traversal_tag> { +public: + iter_wrapper() = default; + explicit iter_wrapper(WrappedIter it_in) : it(std::move(it_in)) {} + + // Templated copy-constructor to allow for interoperable iterator and + // const_iterator. 
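// Usage sketch (illustrative, hypothetical type): hash_all() from the
// updated util/hash.h combines the hashes of its arguments via ue2_hash,
// so a compound compile-time type can expose a one-line hash() member,
// even when some of its members are containers.
#include "util/hash.h"

#include <string>
#include <vector>

struct example_key {
    unsigned id = 0;
    std::string name;
    std::vector<unsigned> deps;

    bool operator==(const example_key &o) const {
        return id == o.id && name == o.name && deps == o.deps;
    }
    // Strings fall back to std::hash; the vector is hashed element-wise via
    // the container specialisation of ue2_hash.
    size_t hash() const { return ue2::hash_all(id, name, deps); }
};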
+ template friend class iter_wrapper; + + template + iter_wrapper(iter_wrapper other, + typename std::enable_if::value>::type * = nullptr) + : it(std::move(other.it)) {} + + WrappedIter get() const { return it; } + +private: + friend class boost::iterator_core_access; + + WrappedIter it; + + void increment() { ++it; } + void decrement() { --it; } + void advance(size_t n) { it += n; } + typename std::iterator_traits::difference_type + distance_to(const iter_wrapper &other) const { + return other.it - it; + } + bool equal(const iter_wrapper &other) const { return it == other.it; } + Value &dereference() const { return *it; } +}; + +template +class element_store { + std::vector data; + ue2_unordered_map map; + +public: + bool empty() const { + return data.empty(); + } + + size_t size() const { + assert(data.size() == map.size()); + return data.size(); + } + + void clear() { + data.clear(); + map.clear(); + } + + void reserve(size_t n) { + data.reserve(n); + map.reserve(n); + } + + // Iteration. + + using const_iterator = + iter_wrapper::const_iterator, + const Element>; + using iterator = + iter_wrapper::iterator, Element>; + + const_iterator begin() const { + return const_iterator(data.begin()); + } + + const_iterator end() const { + return const_iterator(data.end()); + } + + iterator begin() { + return iterator(data.begin()); + } + + iterator end() { + return iterator(data.end()); + } + + // Search. + + const_iterator find(const Key &key) const { + auto map_it = map.find(key); + if (map_it == map.end()) { + return end(); + } + auto idx = map_it->second; + assert(idx < data.size()); + return begin() + idx; + } + + iterator find(const Key &key) { + auto map_it = map.find(key); + if (map_it == map.end()) { + return end(); + } + auto idx = map_it->second; + assert(idx < data.size()); + return begin() + idx; + } + + // Insert. 
+ + std::pair insert(const Key &key, const Element &element) { + const auto idx = data.size(); + if (map.emplace(key, idx).second) { + data.push_back(element); + return {begin() + idx, true}; + } + return {end(), false}; + } + + bool operator==(const element_store &a) const { + return data == a.data; + } + + bool operator<(const element_store &a) const { + return data < a.data; + } + + void swap(element_store &a) { + using std::swap; + swap(data, a.data); + swap(map, a.map); + } +}; + +} // namespace insertion_ordered_detail + +template +class insertion_ordered_map + : public totally_ordered> { +public: + using key_type = Key; + using mapped_type = Value; + using value_type = std::pair; + +private: + using store_type = insertion_ordered_detail::element_store; + store_type store; + +public: + using const_iterator = typename store_type::const_iterator; + using iterator = typename store_type::iterator; + + insertion_ordered_map() = default; + + template + insertion_ordered_map(Iter it, Iter it_end) { + insert(it, it_end); + } + + explicit insertion_ordered_map(std::initializer_list init) { + insert(init.begin(), init.end()); + } + + const_iterator begin() const { return store.begin(); } + const_iterator end() const { return store.end(); } + iterator begin() { return store.begin(); } + iterator end() { return store.end(); } + + const_iterator find(const Key &key) const { + return store.find(key); + } + + iterator find(const Key &key) { + return store.find(key); + } + + std::pair insert(const std::pair &p) { + return store.insert(p.first, p); + } + + template + void insert(Iter it, Iter it_end) { + for (; it != it_end; ++it) { + insert(*it); + } + } + + Value &operator[](const Key &key) { + auto it = find(key); + if (it == end()) { + it = insert({key, Value{}}).first; + } + return it->second; + } + + const Value &at(const Key &key) const { + return find(key)->second; + } + + Value &at(const Key &key) { + return find(key)->second; + } + + bool empty() const { + return store.empty(); + } + + size_t size() const { + return store.size(); + } + + void clear() { + store.clear(); + } + + void reserve(size_t n) { + store.reserve(n); + } + + bool operator==(const insertion_ordered_map &a) const { + return store == a.store; + } + + bool operator<(const insertion_ordered_map &a) const { + return store < a.store; + } + + void swap(insertion_ordered_map &a) { + store.swap(a.store); + } + + friend void swap(insertion_ordered_map &a, insertion_ordered_map &b) { + a.swap(b); + } +}; + +template +class insertion_ordered_set + : public totally_ordered> { +public: + using key_type = Key; + using value_type = Key; + +private: + using store_type = insertion_ordered_detail::element_store; + store_type store; + +public: + using const_iterator = typename store_type::const_iterator; + using iterator = typename store_type::iterator; + + insertion_ordered_set() = default; + + template + insertion_ordered_set(Iter it, Iter it_end) { + insert(it, it_end); + } + + explicit insertion_ordered_set(std::initializer_list init) { + insert(init.begin(), init.end()); + } + + const_iterator begin() const { return store.begin(); } + const_iterator end() const { return store.end(); } + + const_iterator find(const Key &key) const { + return store.find(key); + } + + std::pair insert(const Key &key) { + return store.insert(key, key); + } + + template + void insert(Iter it, Iter it_end) { + for (; it != it_end; ++it) { + insert(*it); + } + } + + bool empty() const { + return store.empty(); + } + + size_t size() const { + return 
store.size(); + } + + void clear() { + store.clear(); + } + + void reserve(size_t n) { + store.reserve(n); + } + + bool operator==(const insertion_ordered_set &a) const { + return store == a.store; + } + + bool operator<(const insertion_ordered_set &a) const { + return store < a.store; + } + + void swap(insertion_ordered_set &a) { + store.swap(a.store); + } + + friend void swap(insertion_ordered_set &a, insertion_ordered_set &b) { + a.swap(b); + } +}; + +} // namespace ue2 + +#endif // UTIL_INSERTION_ORDERED_H diff --git a/src/util/multibit_build.h b/src/util/multibit_build.h index 2d7b5fc2..ba5c8dfa 100644 --- a/src/util/multibit_build.h +++ b/src/util/multibit_build.h @@ -43,10 +43,16 @@ bool operator==(const mmbit_sparse_iter &a, const mmbit_sparse_iter &b) { return a.mask == b.mask && a.val == b.val; } -inline -size_t hash_value(const mmbit_sparse_iter &iter) { - return ue2::hash_all(iter.mask, iter.val); -} +namespace std { + +template<> +struct hash { + size_t operator()(const mmbit_sparse_iter &iter) const { + return ue2::hash_all(iter.mask, iter.val); + } +}; + +} // namespace std namespace ue2 { diff --git a/src/util/multibit_compress.h b/src/util/multibit_compress.h new file mode 100644 index 00000000..e7b4fd8e --- /dev/null +++ b/src/util/multibit_compress.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** file + * \brief multibit compression API: compress / decompress / size + */ + +#ifndef MULTIBIT_COMPRESS_H +#define MULTIBIT_COMPRESS_H + +#include "multibit.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief size API. */ +static really_inline +size_t mmbit_compsize(const u8 *bits, u32 total_bits) { + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + return (ROUNDUP_N(total_bits, 8) / 8); + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + return sizeof(MMB_TYPE); + } + // Deal with normal pyramid mmb. 
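// Usage sketch (illustrative only): insertion_ordered_map from the new
// util/insertion_ordered.h behaves like an unordered map for lookup but
// iterates in insertion order, which keeps compile-time passes that walk
// it deterministic. Keys need only be hashable by ue2_hasher.
#include "util/insertion_ordered.h"

#include <string>

inline void example_insertion_order() {
    ue2::insertion_ordered_map<std::string, unsigned> counts;
    counts.insert({"foo", 1});
    counts.insert({"bar", 2});
    counts["foo"] += 10; // operator[] finds the existing entry.

    for (const auto &elem : counts) {
        // Visits "foo" then "bar", regardless of hash order.
        (void)elem.first;
        (void)elem.second;
    }
}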
+ const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + u32 num_block = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + num_block++; + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + return sizeof(MMB_TYPE) * num_block; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } +} + +/** \brief compress API. */ +static really_inline +char mmbit_compress(const u8 *bits, u32 total_bits, u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED u8 *comp_init = comp; + // Compute comp_size first. + size_t comp_size = mmbit_compsize(bits, total_bits); + // Check whether out of writable range. + if (comp_size > max_comp_space) { + return 0; + } + *comp_space = comp_size; // Return comp_size outside. + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + memcpy(comp, bits, comp_size); + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + memcpy(comp, bits, sizeof(MMB_TYPE)); + return 1; + } + // Deal with normal pyramid mmb. + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + memcpy(comp, &block, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + assert((u32)(comp - comp_init) == comp_size); + return 1; +} + +/** \brief decompress API. */ +static really_inline +char mmbit_decompress(u8 *bits, u32 total_bits, const u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED const u8 *comp_init = comp; + size_t comp_size; + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + comp_size = ROUNDUP_N(total_bits, 8) / 8; + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(comp) == 0) { + comp_size = sizeof(MMB_TYPE); + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with normal mmb. + u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + UNUSED const u8 *comp_end = comp_init + max_comp_space; + // Iteration-version of DFS + memcpy(bits, comp, sizeof(MMB_TYPE)); // Copy root block first. 
+ comp += sizeof(MMB_TYPE); + while (1) { + if (key_rem < MMB_KEY_BITS) { + u8 *block_ptr = mmbit_get_level_root(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + u8 *block_ptr_1 = mmbit_get_level_root(bits, level + 1) + + key * sizeof(MMB_TYPE); + memcpy(block_ptr_1, comp, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + if (comp > comp_end) { + return 0; // Out of buffer. + } + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + comp_size = (u32)(comp - comp_init); + *comp_space = comp_size; + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTBIT_COMPRESS_H + diff --git a/src/util/partitioned_set.h b/src/util/partitioned_set.h index a9e4644d..8a4d3dd9 100644 --- a/src/util/partitioned_set.h +++ b/src/util/partitioned_set.h @@ -31,7 +31,7 @@ #include "container.h" #include "noncopyable.h" -#include "ue2_containers.h" +#include "flat_containers.h" #include "ue2common.h" #include @@ -98,8 +98,7 @@ public: * If the set was not split (due to there being no overlap with splitter or * being a complete subset), INVALID_SUBSET is returned. */ - size_t split(size_t subset_index, - const typename ue2::flat_set &splitter) { + size_t split(size_t subset_index, const flat_set &splitter) { assert(!splitter.empty()); if (splitter.empty()) { return INVALID_SUBSET; @@ -129,12 +128,10 @@ public: } for (auto it = orig.members.begin(); it != orig.members.end(); ++it) { - T member = *it; + const auto &member = *it; assert(member < member_to_subset.size()); - while (sp_it != sp_e && *sp_it < member) { - ++sp_it; - } + sp_it = std::lower_bound(sp_it, sp_e, member); if (sp_it == sp_e) { split_temp_diff.insert(split_temp_diff.end(), it, orig.members.end()); @@ -193,7 +190,7 @@ public: /** * Returns all subsets which have a member in keys. */ - void find_overlapping(const typename ue2::flat_set &keys, + void find_overlapping(const flat_set &keys, std::vector *containing) const { boost::dynamic_bitset<> seen(subsets.size()); // all zero by default. 
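/* Round-trip sketch (illustrative only): compressing a live multibit with
 * the helpers above and expanding it back. mmbit_set(), mmbit_clear() and
 * mmbit_isset() are assumed from multibit.h, and total_bits is assumed to
 * be large enough for the keys used here. */
static really_inline
char example_mmbit_roundtrip(u8 *bits, u32 total_bits, u8 *comp,
                             size_t max_comp_space) {
    mmbit_set(bits, total_bits, 3);
    mmbit_set(bits, total_bits, 42);

    size_t used = 0;
    if (!mmbit_compress(bits, total_bits, comp, &used, max_comp_space)) {
        return 0; /* need at least mmbit_compsize(bits, total_bits) bytes */
    }

    /* Wipe the multibit and restore it from the compressed image. */
    mmbit_clear(bits, total_bits);
    size_t consumed = 0;
    if (!mmbit_decompress(bits, total_bits, comp, &consumed, max_comp_space)) {
        return 0;
    }
    return mmbit_isset(bits, total_bits, 3) && mmbit_isset(bits, total_bits, 42);
}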
diff --git a/src/util/report.h b/src/util/report.h index a8e233ff..0d5e69b8 100644 --- a/src/util/report.h +++ b/src/util/report.h @@ -206,13 +206,6 @@ bool operator==(const Report &a, const Report &b) { a.topSquashDistance == b.topSquashDistance; } -inline -size_t hash_value(const Report &r) { - return hash_all(r.type, r.quashSom, r.minOffset, r.maxOffset, r.minLength, - r.ekey, r.offsetAdjust, r.onmatch, r.revNfaIndex, - r.somDistance, r.topSquashDistance); -} - static inline Report makeECallback(u32 report, s32 offsetAdjust, u32 ekey) { Report ir(EXTERNAL_CALLBACK, report); @@ -262,6 +255,19 @@ bool isSimpleExhaustible(const Report &ir) { return true; } -} // namespace +} // namespace ue2 + +namespace std { + +template<> +struct hash { + std::size_t operator()(const ue2::Report &r) const { + return ue2::hash_all(r.type, r.quashSom, r.minOffset, r.maxOffset, + r.minLength, r.ekey, r.offsetAdjust, r.onmatch, + r.revNfaIndex, r.somDistance, r.topSquashDistance); + } +}; + +} // namespace std #endif // UTIL_REPORT_H diff --git a/src/util/report_manager.cpp b/src/util/report_manager.cpp index a846eb25..c0e9ee15 100644 --- a/src/util/report_manager.cpp +++ b/src/util/report_manager.cpp @@ -133,7 +133,7 @@ vector ReportManager::getDkeyToReportTable() const { void ReportManager::assignDkeys(const RoseBuild *rose) { DEBUG_PRINTF("assigning...\n"); - map> ext_to_int; + map> ext_to_int; for (u32 i = 0; i < reportIds.size(); i++) { const Report &ir = reportIds[i]; diff --git a/src/util/report_manager.h b/src/util/report_manager.h index 95e14a2c..aa359ed7 100644 --- a/src/util/report_manager.h +++ b/src/util/report_manager.h @@ -38,10 +38,10 @@ #include "util/compile_error.h" #include "util/noncopyable.h" #include "util/report.h" -#include "util/ue2_containers.h" #include #include +#include #include namespace ue2 { @@ -131,17 +131,17 @@ private: /** \brief Mapping from Report to ID (inverse of \ref reportIds * vector). */ - unordered_map reportIdToInternalMap; + std::unordered_map reportIdToInternalMap; /** \brief Mapping from ReportID to dedupe key. */ - unordered_map reportIdToDedupeKey; + std::unordered_map reportIdToDedupeKey; /** \brief Mapping from ReportID to Rose program offset in bytecode. */ - unordered_map reportIdToProgramOffset; + std::unordered_map reportIdToProgramOffset; /** \brief Mapping from external match ids to information about that * id. */ - unordered_map externalIdMap; + std::unordered_map externalIdMap; /** \brief Mapping from expression index to exhaustion key. */ std::map toExhaustibleKeyMap; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 047cdbab..c1449711 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -169,16 +169,24 @@ m128 load_m128_from_u64a(const u64a *p) { #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + #if !defined(HAVE_AVX2) // TODO: this entire file needs restructuring - this carveout is awful #define extractlow64from256(a) movq(a.lo) #define extractlow32from256(a) movd(a.lo) #if defined(HAVE_SSE41) #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? 
a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) #else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8)) +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) #endif #endif // !AVX2 @@ -741,8 +749,8 @@ m128 movdq_lo(m256 x) { #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) #define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) #define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b); -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b); +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) #define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) static really_inline @@ -755,6 +763,15 @@ m256 combine2x128(m128 hi, m128 lo) { } #endif //AVX2 +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + /**** **** 384-bit Primitives ****/ @@ -969,6 +986,19 @@ m512 set8x64(u64a a) { return _mm512_set1_epi64(a); } +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + static really_inline m512 set4x128(m128 a) { return _mm512_broadcast_i32x4(a); @@ -1059,6 +1089,7 @@ m512 lshift64_m512(m512 a, unsigned b) { #if defined(HAVE_AVX512) #define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) #define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) #endif #if !defined(_MM_CMPINT_NE) @@ -1169,6 +1200,11 @@ static really_inline m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { return _mm512_mask_loadu_epi8(src, k, ptr); } + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} #endif // packed unaligned store of first N bytes diff --git a/src/util/small_vector.h b/src/util/small_vector.h index 6293759c..0f54bbf6 100644 --- a/src/util/small_vector.h +++ b/src/util/small_vector.h @@ -33,7 +33,12 @@ #include -#if BOOST_VERSION >= 105800 +/* + * We use the small_vector constructors introduced in Boost 1.61 (trac bug + * #11866, github commit b436c91). If the Boost version is too old, we fall + * back to using std::vector. 
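// Sketch of the lane selection fixed above (illustrative; uses plain AVX2
// intrinsics rather than the m256 wrapper type): a 256-bit vector holds
// four 64-bit lanes, so lanes 2 and 3 live in the high 128-bit half and
// must be selected with (imm >> 1), not (imm >> 2).
#include <immintrin.h>
#include <stdint.h>

static inline uint64_t example_extract64from256(__m256i a, unsigned imm) {
    __m128i half = (imm >> 1) ? _mm256_extracti128_si256(a, 1)
                              : _mm256_castsi256_si128(a);
    return (imm % 2) ? (uint64_t)_mm_extract_epi64(half, 1)
                     : (uint64_t)_mm_extract_epi64(half, 0);
}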
+ */ +#if BOOST_VERSION >= 106100 # define HAVE_BOOST_CONTAINER_SMALL_VECTOR #endif diff --git a/src/util/ue2_graph.h b/src/util/ue2_graph.h index 138d7467..bf719fd7 100644 --- a/src/util/ue2_graph.h +++ b/src/util/ue2_graph.h @@ -34,7 +34,6 @@ #include "util/noncopyable.h" #include "util/operators.h" -#include #include /* vertex_index_t, ... */ #include /* no_property */ #include @@ -42,7 +41,9 @@ #include #include +#include /* hash */ #include /* tie */ +#include /* is_same, etc */ #include /* pair, declval */ /* @@ -168,7 +169,76 @@ struct default_vertex_property { size_t index; }; -} +template +class vertex_descriptor : totally_ordered> { + using vertex_node = typename Graph::vertex_node; +public: + vertex_descriptor() : p(nullptr), serial(0) {} + explicit vertex_descriptor(vertex_node *pp) : p(pp), serial(pp->serial) {} + + operator bool() const { return p; } + bool operator<(const vertex_descriptor b) const { + if (p && b.p) { + /* no vertices in the same graph can have the same serial */ + assert(p == b.p || serial != b.serial); + return serial < b.serial; + } else { + return p < b.p; + } + } + bool operator==(const vertex_descriptor b) const { return p == b.p; } + + size_t hash() const { + return std::hash()(serial); + } + +private: + vertex_node *raw(void) { return p; } + vertex_node *p; + u64a serial; + friend Graph; +}; + +template +class edge_descriptor : totally_ordered> { + using edge_node = typename Graph::edge_node; +public: + edge_descriptor() : p(nullptr), serial(0) {} + explicit edge_descriptor(edge_node *pp) : p(pp), serial(pp->serial) {} + + /* Convenience ctor to allow us to directly get an edge_descriptor from + * edge() and add_edge(). As we have null_edges and we always allow + * parallel edges, the bool component of the return from these functions is + * not required. 
*/ + edge_descriptor(const std::pair &tup) + : p(tup.first.p), serial(tup.first.serial) { + assert(tup.second == (bool)tup.first); + } + + operator bool() const { return p; } + bool operator<(const edge_descriptor b) const { + if (p && b.p) { + /* no edges in the same graph can have the same serial */ + assert(p == b.p || serial != b.serial); + return serial < b.serial; + } else { + return p < b.p; + } + } + bool operator==(const edge_descriptor b) const { return p == b.p; } + + size_t hash() const { + return std::hash()(serial); + } + +private: + edge_node *raw(void) { return p; } + edge_node *p; + u64a serial; + friend Graph; +}; + +} // namespace graph_detail template; + using edge_descriptor = graph_detail::edge_descriptor; + friend vertex_descriptor; + friend edge_descriptor; + using vertices_size_type = typename vertices_list_type::size_type; using degree_size_type = typename vertex_edge_list::size_type; @@ -293,78 +368,6 @@ public: using vertex_bundled = VertexPropertyType; using edge_bundled = EdgePropertyType; - class vertex_descriptor : totally_ordered { - public: - vertex_descriptor() : p(nullptr), serial(0) { } - explicit vertex_descriptor(vertex_node *pp) - : p(pp), serial(pp->serial) { } - - operator bool() const { return p; } - bool operator<(const vertex_descriptor b) const { - if (p && b.p) { - /* no vertices in the same graph can have the same serial */ - assert(p == b.p || serial != b.serial); - return serial < b.serial; - } else { - return p < b.p; - } - } - bool operator==(const vertex_descriptor b) const { - return p == b.p; - } - - friend size_t hash_value(vertex_descriptor v) { - using boost::hash_value; - return hash_value(v.serial); - } - - private: - vertex_node *raw(void) { return p; } - vertex_node *p; - u64a serial; - friend ue2_graph; - }; - - class edge_descriptor : totally_ordered { - public: - edge_descriptor() : p(nullptr), serial(0) { } - explicit edge_descriptor(edge_node *pp) : p(pp), serial(pp->serial) { } - - /* Convenice ctor to allow us to directly get an edge_descriptor from - * edge() and add_edge(). As we have null_edges and we always allow - * parallel edges, the bool component of the return from these functions - * is not required. */ - edge_descriptor(const std::pair &tup) - : p(tup.first.p), serial(tup.first.serial) { - assert(tup.second == (bool)tup.first); - } - - operator bool() const { return p; } - bool operator<(const edge_descriptor b) const { - if (p && b.p) { - /* no edges in the same graph can have the same serial */ - assert(p == b.p || serial != b.serial); - return serial < b.serial; - } else { - return p < b.p; - } - } - bool operator==(const edge_descriptor b) const { - return p == b.p; - } - - friend size_t hash_value(edge_descriptor e) { - using boost::hash_value; - return hash_value(e.serial); - } - - private: - edge_node *raw(void) { return p; } - edge_node *p; - u64a serial; - friend ue2_graph; - }; - private: /* Note: apparently, nested class templates cannot be fully specialised but * they can be partially specialised. Sigh, ... */ @@ -1284,7 +1287,7 @@ edge_index_upper_bound(const Graph &g) { using boost::vertex_index; using boost::edge_index; -} +} // namespace ue2 namespace boost { @@ -1301,5 +1304,29 @@ struct property_map())) const_type; }; -} +} // namespace boost + +namespace std { + +/* Specialization of std::hash so that vertex_descriptor can be used in + * unordered containers. 
*/ +template +struct hash> { + using vertex_descriptor = ue2::graph_detail::vertex_descriptor; + std::size_t operator()(const vertex_descriptor &v) const { + return v.hash(); + } +}; + +/* Specialization of std::hash so that edge_descriptor can be used in + * unordered containers. */ +template +struct hash> { + using edge_descriptor = ue2::graph_detail::edge_descriptor; + std::size_t operator()(const edge_descriptor &e) const { + return e.hash(); + } +}; + +} // namespace std #endif diff --git a/src/util/ue2string.cpp b/src/util/ue2string.cpp index 7c16aa58..98b007d4 100644 --- a/src/util/ue2string.cpp +++ b/src/util/ue2string.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,11 +29,15 @@ /** \file * \brief Tools for string manipulation, ue2_literal definition. */ -#include "charreach.h" -#include "compare.h" + #include "ue2string.h" +#include "charreach.h" +#include "compare.h" +#include "hash_dynamic_bitset.h" + #include +#include #include #include #include @@ -233,13 +237,15 @@ ue2_literal::elem::operator CharReach () const { } } +const ue2_literal::size_type ue2_literal::npos = std::string::npos; + ue2_literal::ue2_literal(const std::string &s_in, bool nc_in) - : s(nc_in ? toUpperString(s_in) : s_in), nocase(s_in.size(), nc_in) { + : s(nc_in ? toUpperString(s_in) : s_in), nocase(s_in.size()) { if (nc_in) { - // Quash nocase bit for non-alpha chars + // Switch on nocase bit for all alpha characters. for (size_t i = 0; i < s.length(); i++) { - if (!ourisalpha(s[i])) { - nocase[i] = false; + if (ourisalpha(s[i])) { + nocase.set(i); } } } @@ -252,21 +258,27 @@ ue2_literal ue2_literal::substr(size_type pos, size_type n) const { ue2_literal rv; rv.s = s.substr(pos, n); size_type upper = nocase.size(); - if (n != string::npos && n + pos < nocase.size()) { + if (n != npos && n + pos < nocase.size()) { upper = n + pos; } - rv.nocase.insert(rv.nocase.end(), nocase.begin() + pos, - nocase.begin() + upper); + + rv.nocase.resize(upper - pos, false); + for (size_t i = pos; i < upper; i++) { + rv.nocase.set(i - pos, nocase.test(i)); + } + assert(s.size() == nocase.size()); return rv; } ue2_literal &ue2_literal::erase(size_type pos, size_type n) { s.erase(pos, n); - size_type upper = nocase.size(); - if (n != string::npos && n + pos < nocase.size()) { - upper = n + pos; + + if (n != npos) { + for (size_type i = pos + n; i < nocase.size(); i++) { + nocase.set(i - n, nocase.test(i)); + } } - nocase.erase(nocase.begin() + pos, nocase.begin() + upper); + nocase.resize(s.size()); return *this; } @@ -279,18 +291,24 @@ void ue2_literal::push_back(char c, bool nc) { s.push_back(c); } +void ue2_literal::reverse() { + std::reverse(s.begin(), s.end()); + + const size_t len = nocase.size(); + for (size_t i = 0; i < len / 2; i++) { + size_t j = len - i - 1; + bool a = nocase.test(i); + bool b = nocase.test(j); + nocase.set(i, b); + nocase.set(j, a); + } +} + // Return a copy of this literal in reverse order. 
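// Pattern sketch (illustrative, hypothetical type): the descriptors above
// drop their boost::hash_value friends in favour of a hash() member plus a
// std::hash specialisation, which is what lets them be stored directly in
// std::unordered_set / std::unordered_map.
#include <cstddef>
#include <functional>
#include <unordered_set>

class example_descriptor {
public:
    explicit example_descriptor(unsigned long long s) : serial(s) {}
    bool operator==(const example_descriptor &o) const {
        return serial == o.serial;
    }
    size_t hash() const { return std::hash<unsigned long long>()(serial); }

private:
    unsigned long long serial;
};

namespace std {
template<>
struct hash<example_descriptor> {
    size_t operator()(const example_descriptor &d) const { return d.hash(); }
};
} // namespace std

// example_descriptor can now key a std::unordered_set without passing a
// custom hasher argument.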
ue2_literal reverse_literal(const ue2_literal &in) { - ue2_literal rv; - if (in.empty()) { - return rv; - } - - for (ue2_literal::const_iterator it = in.end(); it != in.begin();) { - --it; - rv.push_back(it->c, it->nocase); - } - return rv; + auto out = in; + out.reverse(); + return out; } bool ue2_literal::operator<(const ue2_literal &b) const { @@ -303,46 +321,28 @@ bool ue2_literal::operator<(const ue2_literal &b) const { return nocase < b.nocase; } -ue2_literal operator+(const ue2_literal &a, const ue2_literal &b) { - ue2_literal rv; - rv.s = a.s + b.s; - rv.nocase = a.nocase; - rv.nocase.insert(rv.nocase.end(), b.nocase.begin(), b.nocase.end()); - return rv; -} - void ue2_literal::operator+=(const ue2_literal &b) { s += b.s; - nocase.insert(nocase.end(), b.nocase.begin(), b.nocase.end()); + size_t prefix = nocase.size(); + nocase.resize(prefix + b.nocase.size()); + for (size_t i = 0; i < b.nocase.size(); i++) { + nocase.set(prefix + i, b.nocase[i]); + } } bool ue2_literal::any_nocase() const { - return find(nocase.begin(), nocase.end(), true) != nocase.end(); + return nocase.any(); } -bool mixed_sensitivity(const ue2_literal &s) { - bool cs = false; - bool nc = false; - for (ue2_literal::const_iterator it = s.begin(); it != s.end(); ++it) { - if (!ourisalpha(it->c)) { - continue; - } - if (it->nocase) { - nc = true; - } else { - cs = true; - } - } - - return cs && nc; +size_t ue2_literal::hash() const { + return hash_all(s, hash_dynamic_bitset()(nocase)); } void make_nocase(ue2_literal *lit) { ue2_literal rv; - for (ue2_literal::const_iterator it = lit->begin(); it != lit->end(); - ++it) { - rv.push_back(it->c, ourisalpha(it->c)); + for (const auto &elem: *lit) { + rv.push_back(elem.c, ourisalpha(elem.c)); } lit->swap(rv); diff --git a/src/util/ue2string.h b/src/util/ue2string.h index a90d47a3..0fa76c3a 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -35,12 +35,15 @@ #include "ue2common.h" #include "util/charreach.h" +#include "util/compare.h" #include "util/hash.h" +#include "util/operators.h" #include #include #include +#include #include namespace ue2 { @@ -79,7 +82,7 @@ struct ue2_case_string { bool nocase; }; -struct ue2_literal { +struct ue2_literal : totally_ordered { public: /// Single element proxy, pointed to by our const_iterator. 
struct elem { @@ -107,38 +110,38 @@ public: private: friend class boost::iterator_core_access; void increment() { - ++it; ++it_nc; + ++idx; } void decrement() { - --it; --it_nc; + --idx; } void advance(size_t n) { - it += n; it_nc += n; + idx += n; } difference_type distance_to(const const_iterator &other) const { - return other.it - it; + return other.idx - idx; } bool equal(const const_iterator &other) const { - return it == other.it; + return idx == other.idx && lit == other.lit; } const elem dereference() const { - return elem(*it, *it_nc); + return elem(lit->s[idx], lit->nocase[idx]); } friend struct ue2_literal; - const_iterator(const std::string::const_iterator &it_in, - const std::vector::const_iterator &it_nc_in) - : it(it_in), it_nc(it_nc_in) {} + const_iterator(const ue2_literal &lit_in, size_t idx_in) + : lit(&lit_in), idx(idx_in) {} - std::string::const_iterator it; - std::vector::const_iterator it_nc; + const ue2_literal *lit = nullptr; + size_t idx; }; using const_reverse_iterator = std::reverse_iterator; + using size_type = std::string::size_type; - typedef std::string::size_type size_type; + static const size_type npos; - ue2_literal() {} + ue2_literal() = default; ue2_literal(const std::string &s_in, bool nc_in); ue2_literal(char c, bool nc_in); ue2_literal(const ue2_literal &) = default; @@ -155,16 +158,16 @@ public: size_type length() const { return s.length(); } bool empty() const { return s.empty(); } - ue2_literal substr(size_type pos, size_type n = std::string::npos) const; + ue2_literal substr(size_type pos, size_type n = npos) const; const char *c_str() const { return s.c_str(); } bool any_nocase() const; const_iterator begin() const { - return const_iterator(s.begin(), nocase.begin()); + return const_iterator(*this, 0); } const_iterator end() const { - return const_iterator(s.end(), nocase.end()); + return const_iterator(*this, s.size()); } const_reverse_iterator rbegin() const { @@ -175,22 +178,26 @@ public: return const_reverse_iterator(begin()); } - ue2_literal &erase(size_type pos = 0, size_type n = std::string::npos); + ue2_literal &erase(size_type pos = 0, size_type n = npos); void push_back(const elem &e) { push_back(e.c, e.nocase); } void push_back(char c, bool nc); - const elem back() const { return elem(*s.rbegin(), nocase.back()); } - friend ue2_literal operator+(const ue2_literal &a, const ue2_literal &b); + const elem back() const { return *rbegin(); } + + friend ue2_literal operator+(ue2_literal a, const ue2_literal &b) { + a += b; + return a; + } + + /// Reverse this literal in-place. + void reverse(); void operator+=(const ue2_literal &b); bool operator==(const ue2_literal &b) const { return s == b.s && nocase == b.nocase; } - bool operator!=(const ue2_literal &b) const { - return !(*this == b); - } bool operator<(const ue2_literal &b) const; void clear(void) { s.clear(); nocase.clear(); } @@ -202,19 +209,14 @@ public: nocase.swap(other.nocase); } + size_t hash() const; + private: + friend const_iterator; std::string s; - std::vector nocase; /* for trolling value */ + boost::dynamic_bitset<> nocase; }; -inline -size_t hash_value(const ue2_literal::elem &elem) { - return hash_all(elem.c, elem.nocase); -} - -inline -size_t hash_value(const ue2_literal &lit) { return hash_range(lit); } - /// Return a reversed copy of this literal. 
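// Minimal sketch of the representation used above (illustrative only, not
// the real ue2_literal): per-character case-insensitivity lives in a
// boost::dynamic_bitset parallel to the string, and the hash combines the
// string with that bitset via hash_dynamic_bitset, as ue2_literal::hash()
// does.
#include "util/hash.h"
#include "util/hash_dynamic_bitset.h"

#include <boost/dynamic_bitset.hpp>
#include <cctype>
#include <string>

struct example_literal {
    std::string s;
    boost::dynamic_bitset<> nocase;

    void push_back(char c, bool nc) {
        if (nc) {
            c = (char)std::toupper((unsigned char)c);
        }
        s.push_back(c);
        nocase.push_back(nc);
    }
    bool any_nocase() const { return nocase.any(); }
    size_t hash() const {
        return ue2::hash_all(s, ue2::hash_dynamic_bitset()(nocase));
    }
};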
ue2_literal reverse_literal(const ue2_literal &in); @@ -226,9 +228,36 @@ size_t maxStringSelfOverlap(const ue2_literal &a); size_t minStringPeriod(const ue2_literal &a); size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b); -/** \brief True iff the literal cannot be considered entirely case-sensitive - * nor entirely case-insensitive */ -bool mixed_sensitivity(const ue2_literal &lit); +/** + * \brief True iff the range of a literal given cannot be considered entirely + * case-sensitive nor entirely case-insensitive. + */ +template +bool mixed_sensitivity_in(Iter begin, Iter end) { + bool cs = false; + bool nc = false; + for (auto it = begin; it != end; ++it) { + if (!ourisalpha(it->c)) { + continue; + } + if (it->nocase) { + nc = true; + } else { + cs = true; + } + } + + return cs && nc; +} + +/** + * \brief True iff the literal cannot be considered entirely case-sensitive + * nor entirely case-insensitive. + */ +inline +bool mixed_sensitivity(const ue2_literal &s) { + return mixed_sensitivity_in(s.begin(), s.end()); +} void make_nocase(ue2_literal *lit); @@ -286,4 +315,22 @@ std::string escapeString(const ue2_literal &lit); } // namespace ue2 +namespace std { + +template<> +struct hash { + size_t operator()(const ue2::ue2_literal::elem &elem) const { + return ue2::hash_all(elem.c, elem.nocase); + } +}; + +template<> +struct hash { + size_t operator()(const ue2::ue2_literal &lit) const { + return lit.hash(); + } +}; + +} // namespace std + #endif diff --git a/src/util/unordered.h b/src/util/unordered.h new file mode 100644 index 00000000..a8aa61cd --- /dev/null +++ b/src/util/unordered.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_UNORDERED_H +#define UTIL_UNORDERED_H + +/** + * \file + * \brief Unordered set and map containers that default to using our own hasher. 
+ */ + +#include "hash.h" + +#include +#include + +namespace ue2 { + +template +using ue2_unordered_set = std::unordered_set; + +template +using ue2_unordered_map = std::unordered_map; + +} // namespace ue2 + + +#endif // UTIL_UNORDERED_H diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt index 9b2cde4d..a8792cf7 100644 --- a/tools/hsbench/CMakeLists.txt +++ b/tools/hsbench/CMakeLists.txt @@ -4,12 +4,7 @@ if (NOT SQLITE3_FOUND) return() endif() -if (NOT XCODE) - include_directories(SYSTEM ${SQLITE3_INCLUDE_DIRS}) -else() - # cmake doesn't think Xcode supports isystem - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${SQLITE3_INCLUDE_DIRS}") -endif() +include_directories(SYSTEM ${SQLITE3_INCLUDE_DIRS}) # BSD has the _np funcs in a _np header CHECK_INCLUDE_FILE_CXX(pthread_np.h HAVE_PTHREAD_NP_H) diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h index efff3f99..a8295911 100644 --- a/tools/hsbench/common.h +++ b/tools/hsbench/common.h @@ -40,5 +40,6 @@ extern std::string serializePath; extern unsigned int somPrecisionMode; extern bool forceEditDistance; extern unsigned editDistance; +extern bool printCompressSize; #endif // COMMON_H diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index 9674e5c8..5f188472 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -205,6 +205,35 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data, } } +void EngineHyperscan::streamCompressExpand(EngineStream &stream, + vector &temp) const { + size_t used = 0; + hs_error_t err = hs_compress_stream(stream.id, temp.data(), temp.size(), + &used); + if (err == HS_INSUFFICIENT_SPACE) { + temp.resize(used); + err = hs_compress_stream(stream.id, temp.data(), temp.size(), &used); + } + + if (err != HS_SUCCESS) { + printf("Fatal error: hs_compress_stream returned error %d\n", err); + abort(); + } + + if (printCompressSize) { + printf("stream %u: compressed to %zu\n", stream.sn, used); + } + + err = hs_reset_and_expand_stream(stream.id, temp.data(), temp.size(), + nullptr, nullptr, nullptr); + + if (err != HS_SUCCESS) { + printf("Fatal error: hs_reset_and expand_stream returned error %d\n", + err); + abort(); + } +} + static unsigned makeModeFlags(ScanMode scan_mode) { switch (scan_mode) { diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h index 7875decc..2c93959b 100644 --- a/tools/hsbench/engine_hyperscan.h +++ b/tools/hsbench/engine_hyperscan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "hs_runtime.h" #include +#include /** Structure for the result of a single complete scan. 
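// Usage sketch (illustrative, hypothetical key type): the ue2_unordered_*
// aliases above default to ue2_hasher, so a key type that provides a
// hash() member (or a pair/container of hashable types) works without a
// std::hash specialisation.
#include "util/unordered.h"

#include <string>
#include <utility>

struct example_lit_id {
    std::string text;
    bool nocase;

    bool operator==(const example_lit_id &o) const {
        return text == o.text && nocase == o.nocase;
    }
    size_t hash() const { return ue2::hash_all(text, nocase); }
};

inline void example_unordered() {
    ue2::ue2_unordered_map<example_lit_id, unsigned> ids;
    ids.emplace(example_lit_id{"foo", true}, 0U);

    // Pairs of hashable types also work as keys, via the pair
    // specialisation of ue2_hash.
    ue2::ue2_unordered_map<std::pair<unsigned, unsigned>, unsigned> edges;
    edges[{1, 2}] = 7;
}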
*/ struct ResultEntry { @@ -79,6 +80,9 @@ public: void streamClose(std::unique_ptr stream, ResultEntry &result) const; + void streamCompressExpand(EngineStream &stream, + std::vector &temp) const; + void streamScan(EngineStream &stream, const char *data, unsigned int len, unsigned int id, ResultEntry &result) const; diff --git a/tools/hsbench/heapstats.cpp b/tools/hsbench/heapstats.cpp index d0dffdb3..5fba7c2a 100644 --- a/tools/hsbench/heapstats.cpp +++ b/tools/hsbench/heapstats.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,18 +47,21 @@ #include size_t getPeakHeap(void) { - FILE *tmpf = tmpfile(); - if (!tmpf) { + size_t fsize; + char *fptr; + FILE *fstr = open_memstream(&fptr, &fsize); + if (!fstr) { return 0; } - int rv = malloc_info(0, tmpf); + int rv = malloc_info(0, fstr); if (rv != 0) { - fclose(tmpf); + fclose(fstr); + free(fptr); return 0; } - rewind(tmpf); + rewind(fstr); // We don't want to depend on a real XML parser. This is ugly and brittle // and hopefully good enough for the time being. We look for the last @@ -71,7 +74,7 @@ size_t getPeakHeap(void) { size_t len = 0, maxheap = 0; ssize_t read; - while ((read = getline(&line, &len, tmpf)) != -1) { + while ((read = getline(&line, &len, fstr)) != -1) { if (strncmp(line, begin, begin_len) == 0) { errno = 0; maxheap = (size_t)strtoull(line + begin_len, nullptr, 10); @@ -83,7 +86,8 @@ size_t getPeakHeap(void) { finish: free(line); - fclose(tmpf); + fclose(fstr); + free(fptr); return maxheap; } diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 9c5fd6cb..f2ea8e7e 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -77,10 +77,13 @@ string serializePath(""); unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; bool forceEditDistance = false; unsigned editDistance = 0; +bool printCompressSize = false; + +// Globals local to this file. +static bool compressStream = false; namespace /* anonymous */ { -// Globals local to this file. bool display_per_scan = false; ScanMode scan_mode = ScanMode::STREAMING; unsigned repeats = 20; @@ -212,11 +215,15 @@ void processArgs(int argc, char *argv[], vector &sigSets, int in_sigfile = 0; int do_per_scan = 0; int do_echo_matches = 0; + int do_compress = 0; + int do_compress_size = 0; vector sigFiles; static struct option longopts[] = { {"per-scan", 0, &do_per_scan, 1}, {"echo-matches", 0, &do_echo_matches, 1}, + {"compress-stream", 0, &do_compress, 1}, + {"print-compress-size", 0, &do_compress_size, 1}, {nullptr, 0, nullptr, 0} }; @@ -338,6 +345,12 @@ void processArgs(int argc, char *argv[], vector &sigSets, if (do_per_scan) { display_per_scan = true; } + if (do_compress) { + compressStream = true; + } + if (do_compress_size) { + printCompressSize = true; + } if (exprPath.empty() && !sigFiles.empty()) { /* attempt to infer an expression directory */ @@ -470,10 +483,12 @@ vector prepStreamingData(const ThreadContext *ctx) { } static -void benchStreamingInternal(ThreadContext *ctx, vector &streams) { +void benchStreamingInternal(ThreadContext *ctx, vector &streams, + bool do_compress) { assert(ctx); const EngineHyperscan &e = ctx->engine; const vector &blocks = ctx->corpus_data; + vector compress_buf(do_compress ? 
1000 : 0); for (ResultEntry &r : ctx->results) { ctx->timer.start(); @@ -491,6 +506,8 @@ void benchStreamingInternal(ThreadContext *ctx, vector &streams) { printf("Fatal error: stream open failed!\n"); exit(1); } + } else if (do_compress) { + e.streamCompressExpand(*stream.eng_handle, compress_buf); } assert(stream.eng_handle); @@ -521,7 +538,7 @@ void benchStreaming(void *context) { startTotalTimer(ctx); - benchStreamingInternal(ctx, streams); + benchStreamingInternal(ctx, streams, false); // Synchronization point ctx->barrier(); @@ -530,6 +547,26 @@ void benchStreaming(void *context) { stopTotalTimer(ctx); } +static +void benchStreamingCompress(void *context) { + ThreadContext *ctx = (ThreadContext *)context; + vector streams = prepStreamingData(ctx); + + // Synchronization point + ctx->barrier(); + + startTotalTimer(ctx); + + benchStreamingInternal(ctx, streams, true); + + // Synchronization point + ctx->barrier(); + + // Now that all threads are finished, we can stop the clock. + stopTotalTimer(ctx); +} + + /** In-memory structure for a data block to be scanned in vectored mode. */ struct VectoredInfo { vector data; @@ -704,7 +741,11 @@ unique_ptr makeThreadContext(const EngineHyperscan &db, thread_func_t fn = nullptr; switch (scan_mode) { case ScanMode::STREAMING: - fn = benchStreaming; + if (compressStream) { + fn = benchStreamingCompress; + } else { + fn = benchStreaming; + } break; case ScanMode::VECTORED: fn = benchVectored; diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index a7658b26..06cddebd 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -2,12 +2,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") set(gtest_SOURCES gtest/gtest-all.cc gtest/gtest.h) -if(NOT XCODE) - include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}) -else() - set(CMAKE_CXX_FLAGS "-isystem ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CXX_FLAGS}") -endif() - +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}) # remove some warnings @@ -25,6 +20,10 @@ if(CXX_WUNUSED_VARIABLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") endif() +if (CXX_UNUSED_LOCAL_TYPEDEFS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs") +endif() + if(CMAKE_COMPILER_IS_GNUCC) # spurious warnings? 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-array-bounds") @@ -42,6 +41,7 @@ set(unit_hyperscan_SOURCES hyperscan/expr_info.cpp hyperscan/extparam.cpp hyperscan/identical.cpp + hyperscan/literals.cpp hyperscan/main.cpp hyperscan/multi.cpp hyperscan/order.cpp @@ -77,10 +77,12 @@ set(unit_internal_SOURCES internal/flat_set.cpp internal/flat_map.cpp internal/graph.cpp + internal/insertion_ordered.cpp internal/lbr.cpp internal/limex_nfa.cpp internal/masked_move.cpp internal/multi_bit.cpp + internal/multi_bit_compress.cpp internal/nfagraph_common.h internal/nfagraph_comp.cpp internal/nfagraph_equivalence.cpp diff --git a/unit/hyperscan/arg_checks.cpp b/unit/hyperscan/arg_checks.cpp index 8e86cc64..0ff4ce5f 100644 --- a/unit/hyperscan/arg_checks.cpp +++ b/unit/hyperscan/arg_checks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -2318,6 +2318,289 @@ TEST(HyperscanArgChecks, hs_populate_platform_null) { ASSERT_EQ(HS_INVALID, err); } +TEST(HyperscanArgChecks, CompressStreamNoStream) { + char buf[100]; + size_t used; + hs_error_t err = hs_compress_stream(nullptr, buf, sizeof(buf), &used); + ASSERT_EQ(HS_INVALID, err); +} + +TEST(HyperscanArgChecks, CompressStreamNoUsed) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream; + hs_error_t err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[100]; + err = hs_compress_stream(stream, buf, sizeof(buf), nullptr); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, CompressStreamNoBuf) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream; + hs_error_t err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[100]; + size_t used; + err = hs_compress_stream(stream, nullptr, sizeof(buf), &used); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, CompressStreamSmallBuff) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream; + hs_error_t err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[100]; + size_t used = 0; + err = hs_compress_stream(stream, buf, 1, &used); + ASSERT_EQ(HS_INSUFFICIENT_SPACE, err); + ASSERT_LT(0, used); + + err = hs_close_stream(stream, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ExpandNoDb) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_expand_stream(nullptr, &stream2, buf, used); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + 
ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ExpandNoTo) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_expand_stream(db, nullptr, buf, used); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ExpandNoBuf) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_expand_stream(db, &stream2, nullptr, used); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ExpandSmallBuf) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_expand_stream(db, &stream2, buf, used / 2); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ResetAndExpandNoStream) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_reset_and_expand_stream(nullptr, buf, used, nullptr, nullptr, + nullptr); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ResetAndExpandNoBuf) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_open_stream(db, 0, &stream2); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_reset_and_expand_stream(stream2, nullptr, used, nullptr, nullptr, + nullptr); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_close_stream(stream2, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} 
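// A minimal sketch of the compress/expand round trip that the argument checks
// above guard, assuming a valid streaming-mode database `db` and an open
// stream `src`; the helper name and control flow here are illustrative only,
// not part of the patch. Only calls shown in the tests above are used:
// hs_compress_stream() writes the stream state into a caller buffer (returning
// HS_INSUFFICIENT_SPACE and the required size in `used` when the buffer is too
// small, as CompressStreamSmallBuff checks), and hs_expand_stream() rebuilds a
// stream from that image.
static
hs_error_t snapshot_and_restore(const hs_database_t *db, hs_stream_t *src,
                                hs_stream_t **restored) {
    std::vector<char> buf(64); // deliberately small to show the resize path
    size_t used = 0;
    hs_error_t err = hs_compress_stream(src, buf.data(), buf.size(), &used);
    if (err == HS_INSUFFICIENT_SPACE) {
        // `used` now holds the space actually required; grow and retry.
        buf.resize(used);
        err = hs_compress_stream(src, buf.data(), buf.size(), &used);
    }
    if (err != HS_SUCCESS) {
        return err;
    }
    // Build a brand-new stream from the compressed image. To reuse an
    // already-open stream instead (flushing its pending matches first),
    // hs_reset_and_expand_stream() is the variant exercised further below.
    return hs_expand_stream(db, restored, buf.data(), used);
}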
+ + +TEST(HyperscanArgChecks, ResetAndExpandSmallBuf) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_open_stream(db, 0, &stream2); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_reset_and_expand_stream(stream2, buf, used / 2, nullptr, nullptr, + nullptr); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_close_stream(stream2, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(HyperscanArgChecks, ResetAndExpandNoScratch) { + hs_database_t *db = buildDB("(foo.*bar){3,}", 0, 0, HS_MODE_STREAM); + ASSERT_NE(nullptr, db); + + hs_stream_t *stream1; + hs_error_t err = hs_open_stream(db, 0, &stream1); + ASSERT_EQ(HS_SUCCESS, err); + + char buf[2000]; + size_t used = 0; + err = hs_compress_stream(stream1, buf, sizeof(buf), &used); + ASSERT_EQ(HS_SUCCESS, err); + + hs_stream_t *stream2; + err = hs_open_stream(db, 0, &stream2); + ASSERT_EQ(HS_SUCCESS, err); + + int temp; + + err = hs_reset_and_expand_stream(stream2, buf, used, nullptr, singleHandler, + &temp); + ASSERT_EQ(HS_INVALID, err); + + err = hs_close_stream(stream1, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_close_stream(stream2, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_free_database(db); + ASSERT_EQ(HS_SUCCESS, err); +} + class BadModeTest : public testing::TestWithParam {}; // hs_compile: Compile a pattern with bogus mode flags set. diff --git a/unit/hyperscan/literals.cpp b/unit/hyperscan/literals.cpp new file mode 100644 index 00000000..86bd317c --- /dev/null +++ b/unit/hyperscan/literals.cpp @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "test_util.h" +#include "gtest/gtest.h" + +#include +#include +#include +#include + +using namespace std; +using namespace testing; + +class HyperscanLiteralTest + : public TestWithParam /* len min,max */, + bool /* add non-literal case */>> { +protected: + virtual void SetUp() { + tie(mode, all_flags, num, bounds, add_non_literal) = GetParam(); + rng.seed(29785643); + + if (mode & HS_MODE_STREAM && all_flags & HS_FLAG_SOM_LEFTMOST) { + mode |= HS_MODE_SOM_HORIZON_LARGE; + } + } + + // Returns (regex, corpus) + pair random_lit(unsigned min_len, unsigned max_len) { + boost::random::uniform_int_distribution<> len_dist(min_len, max_len); + size_t len = len_dist(rng); + + // Limit alphabet to [a-z] so that caseless tests include only alpha + // chars and can be entirely caseless. + boost::random::uniform_int_distribution<> dist('a', 'z'); + + ostringstream oss; + string corpus; + for (size_t i = 0; i < len; i++) { + char c = dist(rng); + oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') + << ((unsigned)c & 0xff); + corpus.push_back(c); + } + return {oss.str(), corpus}; + } + + virtual void TearDown() {} + + boost::random::mt19937 rng; + unsigned mode; + unsigned all_flags; + unsigned num; + pair bounds; + bool add_non_literal; +}; + +static +int count_cb(unsigned, unsigned long long, unsigned long long, unsigned, + void *ctxt) { + size_t *count = (size_t *)ctxt; + (*count)++; + return 0; +} + +static +void do_scan_block(const vector &corpora, const hs_database_t *db, + hs_scratch_t *scratch) { + size_t count = 0; + for (const auto &s : corpora) { + size_t before = count; + hs_error_t err = + hs_scan(db, s.c_str(), s.size(), 0, scratch, count_cb, &count); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_LT(before, count); + } +} + +static +void do_scan_stream(const vector &corpora, const hs_database_t *db, + hs_scratch_t *scratch) { + size_t count = 0; + for (const auto &s : corpora) { + size_t before = count; + hs_stream_t *stream = nullptr; + hs_error_t err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + err = hs_scan_stream(stream, s.c_str(), s.size(), 0, scratch, count_cb, + &count); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_LT(before, count); + err = hs_close_stream(stream, scratch, dummy_cb, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + } +} + +static +void do_scan_vectored(const vector &corpora, const hs_database_t *db, + hs_scratch_t *scratch) { + size_t count = 0; + for (const auto &s : corpora) { + size_t before = count; + const char *const data[] = {s.c_str()}; + const unsigned int data_len[] = {(unsigned int)s.size()}; + hs_error_t err = hs_scan_vector(db, data, data_len, 1, 0, scratch, + count_cb, &count); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_LT(before, count); + } +} + +static +void do_scan(unsigned mode, const vector &corpora, + const hs_database_t *db) { + hs_scratch_t *scratch = nullptr; + hs_error_t err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + + if (mode & 
HS_MODE_BLOCK) { + do_scan_block(corpora, db, scratch); + } else if (mode & HS_MODE_STREAM) { + do_scan_stream(corpora, db, scratch); + } else if (mode & HS_MODE_VECTORED) { + do_scan_vectored(corpora, db, scratch); + } + + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST_P(HyperscanLiteralTest, Caseful) { + vector patterns; + vector corpora; + for (unsigned i = 0; i < num; i++) { + auto r = random_lit(bounds.first, bounds.second); + unsigned flags = all_flags; + patterns.emplace_back(std::move(r.first), flags, i); + corpora.emplace_back(std::move(r.second)); + } + + if (add_non_literal) { + patterns.emplace_back("hatstand.*teakettle", 0, num + 1); + corpora.push_back("hatstand teakettle"); + } + + auto *db = buildDB(patterns, mode); + ASSERT_TRUE(db != nullptr); + + do_scan(mode, corpora, db); + + hs_free_database(db); +} + +TEST_P(HyperscanLiteralTest, Caseless) { + vector patterns; + vector corpora; + for (unsigned i = 0; i < num; i++) { + auto r = random_lit(bounds.first, bounds.second); + unsigned flags = all_flags | HS_FLAG_CASELESS; + patterns.emplace_back(std::move(r.first), flags, i); + corpora.emplace_back(std::move(r.second)); + } + + if (add_non_literal) { + patterns.emplace_back("hatstand.*teakettle", 0, num + 1); + corpora.push_back("hatstand teakettle"); + } + + auto *db = buildDB(patterns, mode); + ASSERT_TRUE(db != nullptr); + + do_scan(mode, corpora, db); + + hs_free_database(db); +} + +TEST_P(HyperscanLiteralTest, MixedCase) { + vector patterns; + vector corpora; + for (unsigned i = 0; i < num; i++) { + auto r = random_lit(bounds.first, bounds.second); + unsigned flags = all_flags; + if (i % 2) { + flags |= HS_FLAG_CASELESS; + } + patterns.emplace_back(std::move(r.first), flags, i); + corpora.emplace_back(std::move(r.second)); + } + + if (add_non_literal) { + patterns.emplace_back("hatstand.*teakettle", 0, num + 1); + corpora.push_back("hatstand teakettle"); + } + + auto *db = buildDB(patterns, mode); + ASSERT_TRUE(db != nullptr); + + do_scan(mode, corpora, db); + + hs_free_database(db); +} + +static const unsigned test_modes[] = {HS_MODE_BLOCK, HS_MODE_STREAM, + HS_MODE_VECTORED}; + +static const unsigned test_flags[] = {0, HS_FLAG_SINGLEMATCH, + HS_FLAG_SOM_LEFTMOST}; + +static const unsigned test_sizes[] = {1, 10, 100, 500, 10000}; + +static const pair test_bounds[] = {{3u, 10u}, {10u, 100u}}; + +INSTANTIATE_TEST_CASE_P(LiteralTest, HyperscanLiteralTest, + Combine(ValuesIn(test_modes), ValuesIn(test_flags), + ValuesIn(test_sizes), ValuesIn(test_bounds), + Bool())); diff --git a/unit/internal/bitfield.cpp b/unit/internal/bitfield.cpp index e5c5f0ce..40087ef7 100644 --- a/unit/internal/bitfield.cpp +++ b/unit/internal/bitfield.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,10 +30,11 @@ #include "gtest/gtest.h" #include "util/bitfield.h" -#include "util/ue2_containers.h" #include +#include +using namespace std; using namespace ue2; template @@ -393,9 +394,9 @@ TYPED_TEST(BitfieldTest, find_nth_sparse) { TYPED_TEST(BitfieldTest, unordered_set) { const size_t size = TypeParam::size(); - // Exercise the hash_value free function by adding bitfields to an + // Exercise the hash specialisation by adding bitfields to an // unordered_set. 
- ue2::unordered_set s; + unordered_set s; s.reserve(size); for (size_t i = 0; i < size; ++i) { diff --git a/unit/internal/depth.cpp b/unit/internal/depth.cpp index ad9ffe38..726aa92c 100644 --- a/unit/internal/depth.cpp +++ b/unit/internal/depth.cpp @@ -29,9 +29,11 @@ #include "config.h" #include "util/depth.h" -#include "util/ue2_containers.h" #include "gtest/gtest.h" +#include + +using namespace std; using namespace ue2; static UNUSED @@ -265,7 +267,7 @@ TEST(depth, u64a_operators) { } TEST(depth, unordered_set) { - ue2::unordered_set depths; + unordered_set depths; for (const auto &val : finite_values) { depths.emplace(val); diff --git a/unit/internal/fdr.cpp b/unit/internal/fdr.cpp index bd0bb4c0..87ab0974 100644 --- a/unit/internal/fdr.cpp +++ b/unit/internal/fdr.cpp @@ -36,9 +36,11 @@ #include "fdr/fdr_engine_description.h" #include "fdr/teddy_compile.h" #include "fdr/teddy_engine_description.h" +#include "hwlm/hwlm_internal.h" #include "util/alloc.h" #include "database.h" +#include "scratch.h" #include "gtest/gtest.h" #include @@ -70,53 +72,38 @@ using namespace ue2; namespace { struct match { - size_t start; size_t end; u32 id; - match(size_t start_in, size_t end_in, u32 id_in) - : start(start_in), end(end_in), id(id_in) {} + match(size_t end_in, u32 id_in) + : end(end_in), id(id_in) {} bool operator==(const match &b) const { - return start == b.start && end == b.end && id == b.id; + return end == b.end && id == b.id; } bool operator<(const match &b) const { - if (id < b.id) { - return true; - } else if (id == b.id) { - if (start < b.start) { - return true; - } else if (start == b.start) { - return end < b.end; - } - } - return false; + return tie(id, end) < tie(b.id, b.end); } match operator+(size_t adj) { - return match(start + adj, end + adj, id); + return match(end + adj, id); } }; +vector matches; + extern "C" { static -hwlmcb_rv_t decentCallback(size_t start, size_t end, u32 id, void *ctxt) { - DEBUG_PRINTF("match %zu-%zu : %u\n", start, end, id); - if (!ctxt) { - return HWLM_CONTINUE_MATCHING; - } +hwlmcb_rv_t decentCallback(size_t end, u32 id, + UNUSED struct hs_scratch *scratch) { + DEBUG_PRINTF("match @%zu : %u\n", end, id); - vector *out = (vector *)ctxt; - out->push_back(match(start, end, id)); + matches.push_back(match(end, id)); return HWLM_CONTINUE_MATCHING; } static -hwlmcb_rv_t decentCallbackT(size_t start, size_t end, u32 id, void *ctxt) { - if (!ctxt) { - return HWLM_TERMINATE_MATCHING; - } - - vector *out = (vector *)ctxt; - out->push_back(match(start, end, id)); +hwlmcb_rv_t decentCallbackT(size_t end, u32 id, + UNUSED struct hs_scratch *scratch) { + matches.push_back(match(end, id)); return HWLM_TERMINATE_MATCHING; } @@ -149,6 +136,31 @@ vector getValidFdrEngines() { return ret; } + +static +bytecode_ptr buildFDREngineHinted(std::vector &lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey) { + auto proto = fdrBuildProtoHinted(HWLM_ENGINE_FDR, lits, make_small, hint, + target, grey); + if (!proto) { + return nullptr; + } + return fdrBuildTable(*proto, grey); +} + +static +bytecode_ptr buildFDREngine(std::vector &lits, + bool make_small, const target_t &target, + const Grey &grey) { + auto proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small, target, grey); + if (!proto) { + return nullptr; + } + return fdrBuildTable(*proto, grey); +} + class FDRp : public TestWithParam { }; @@ -161,17 +173,20 @@ TEST_P(FDRp, Simple) { vector lits; lits.push_back(hwlmLiteral("mnopqr", 0, 0)); - auto fdr = fdrBuildTableHinted(lits, false, hint, 
get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrExec(fdr.get(), (const u8 *)data, sizeof(data), 0, decentCallback, - &matches, HWLM_ALL_GROUPS); + &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(3U, matches.size()); - EXPECT_EQ(match(0, 5, 0), matches[0]); - EXPECT_EQ(match(18, 23, 0), matches[1]); - EXPECT_EQ(match(78, 83, 0), matches[2]); + EXPECT_EQ(match(5, 0), matches[0]); + EXPECT_EQ(match(23, 0), matches[1]); + EXPECT_EQ(match(83, 0), matches[2]); + matches.clear(); } TEST_P(FDRp, SimpleSingle) { @@ -183,18 +198,21 @@ TEST_P(FDRp, SimpleSingle) { vector lits; lits.push_back(hwlmLiteral("m", 0, 0)); - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrExec(fdr.get(), (const u8 *)data, sizeof(data) - 1 /* skip nul */, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(4U, matches.size()); - EXPECT_EQ(match(0, 0, 0), matches[0]); - EXPECT_EQ(match(18, 18, 0), matches[1]); - EXPECT_EQ(match(78, 78, 0), matches[2]); - EXPECT_EQ(match(80, 80, 0), matches[3]); + EXPECT_EQ(match(0, 0), matches[0]); + EXPECT_EQ(match(18, 0), matches[1]); + EXPECT_EQ(match(78, 0), matches[2]); + EXPECT_EQ(match(80, 0), matches[3]); + matches.clear(); } TEST_P(FDRp, MultiLocation) { @@ -204,21 +222,24 @@ TEST_P(FDRp, MultiLocation) { vector lits; lits.push_back(hwlmLiteral("abc", 0, 1)); - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); const u32 testSize = 128; vector data(testSize, 0); + struct hs_scratch scratch; + scratch.fdr_conf = NULL; for (u32 i = 0; i < testSize - 3; i++) { memcpy(data.data() + i, "abc", 3); - vector matches; - fdrExec(fdr.get(), data.data(), testSize, 0, decentCallback, &matches, + fdrExec(fdr.get(), data.data(), testSize, 0, decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(1U, matches.size()); - EXPECT_EQ(match(i, i+2, 1), matches[0]); + EXPECT_EQ(match(i + 2, 1), matches[0]); memset(data.data() + i, 0, 3); + matches.clear(); } } @@ -231,15 +252,18 @@ TEST_P(FDRp, NoRepeat1) { vector lits = { hwlmLiteral("m", 0, 1, 0, HWLM_ALL_GROUPS, {}, {}) }; - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrExec(fdr.get(), (const u8 *)data, sizeof(data) - 1 /* skip nul */, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(1U, matches.size()); - EXPECT_EQ(match(0, 0, 0), matches[0]); + EXPECT_EQ(match(0, 0), matches[0]); + matches.clear(); } TEST_P(FDRp, NoRepeat2) { @@ -252,16 +276,19 @@ TEST_P(FDRp, NoRepeat2) { = { hwlmLiteral("m", 0, 1, 0, HWLM_ALL_GROUPS, {}, {}), hwlmLiteral("A", 0, 42) }; - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, 
hint); - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrExec(fdr.get(), (const u8 *)data, sizeof(data) - 1 /* skip nul */, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(3U, matches.size()); - EXPECT_EQ(match(0, 0, 0), matches[0]); - EXPECT_EQ(match(78, 78, 0), matches[2]); + EXPECT_EQ(match(0, 0), matches[0]); + EXPECT_EQ(match(78, 0), matches[2]); + matches.clear(); } TEST_P(FDRp, NoRepeat3) { @@ -274,15 +301,18 @@ TEST_P(FDRp, NoRepeat3) { = { hwlmLiteral("90m", 0, 1, 0, HWLM_ALL_GROUPS, {}, {}), hwlmLiteral("zA", 0, 1, 0, HWLM_ALL_GROUPS, {}, {}) }; - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrExec(fdr.get(), (const u8 *)data, sizeof(data) - 1 /* skip nul */, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(1U, matches.size()); - EXPECT_EQ(match(31, 32, 0), matches[0]); + EXPECT_EQ(match(32, 0), matches[0]); + matches.clear(); } /** @@ -292,8 +322,7 @@ TEST_P(FDRp, NoRepeat3) { static hwlm_error_t safeExecStreaming(const FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, size_t start, - HWLMCallback cb, void *ctxt, - hwlm_group_t groups) { + HWLMCallback cb, hwlm_group_t groups) { array wrapped_history = {{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}}; if (hlen < 16) { @@ -301,7 +330,10 @@ hwlm_error_t safeExecStreaming(const FDR *fdr, const u8 *hbuf, size_t hlen, memcpy(new_hbuf, hbuf, hlen); hbuf = new_hbuf; } - return fdrExecStreaming(fdr, hbuf, hlen, buf, len, start, cb, ctxt, groups); + struct hs_scratch scratch; + scratch.fdr_conf = NULL; + return fdrExecStreaming(fdr, hbuf, hlen, buf, len, start, cb, &scratch, + groups); } TEST_P(FDRp, SmallStreaming) { @@ -311,16 +343,17 @@ TEST_P(FDRp, SmallStreaming) { vector lits = {hwlmLiteral("a", 1, 1), hwlmLiteral("aardvark", 0, 10)}; - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector expected, matches; - expected.push_back(match(0, 0, 1)); - expected.push_back(match(1, 1, 1)); - expected.push_back(match(2, 2, 1)); + vector expected; + expected.push_back(match(0, 1)); + expected.push_back(match(1, 1)); + expected.push_back(match(2, 1)); safeExecStreaming(fdr.get(), (const u8 *)"", 0, (const u8 *)"aaar", 4, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, HWLM_ALL_GROUPS); for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) { EXPECT_EQ(expected[i], matches[i]); } @@ -328,16 +361,17 @@ TEST_P(FDRp, SmallStreaming) { expected.clear(); matches.clear(); - expected.push_back(match(6, 6, 1)); - expected.push_back(match(1, 8, 10)); + expected.push_back(match(6, 1)); + expected.push_back(match(8, 10)); safeExecStreaming(fdr.get(), (const u8 *)"aaar", 4, (const u8 *)"dvark", 5, - 0, decentCallback, &matches, HWLM_ALL_GROUPS); + 0, decentCallback, HWLM_ALL_GROUPS); for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) { EXPECT_EQ(expected[i], matches[i] + 4); } ASSERT_EQ(expected.size(), matches.size()); + matches.clear(); } TEST_P(FDRp, SmallStreaming2) { @@ -348,25 +382,27 @@ TEST_P(FDRp, SmallStreaming2) { 
hwlmLiteral("kk", 1, 2), hwlmLiteral("aardvark", 0, 10)}; - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - vector expected, matches; - expected.push_back(match(6,6,1)); - expected.push_back(match(7,7,1)); - expected.push_back(match(11,11,1)); - expected.push_back(match(6,13,10)); - expected.push_back(match(13,14,2)); - expected.push_back(match(14,15,2)); + vector expected; + expected.push_back(match(6,1)); + expected.push_back(match(7,1)); + expected.push_back(match(11,1)); + expected.push_back(match(13,10)); + expected.push_back(match(14,2)); + expected.push_back(match(15,2)); safeExecStreaming(fdr.get(), (const u8 *)"foobar", 6, - (const u8 *)"aardvarkkk", 10, 0, decentCallback, &matches, + (const u8 *)"aardvarkkk", 10, 0, decentCallback, HWLM_ALL_GROUPS); for (u32 i = 0; i < MIN(expected.size(), matches.size()); i++) { EXPECT_EQ(expected[i], matches[i] + 6); } ASSERT_EQ(expected.size(), matches.size()); + matches.clear(); } TEST_P(FDRp, moveByteStream) { @@ -378,7 +414,8 @@ TEST_P(FDRp, moveByteStream) { vector lits; lits.push_back(hwlmLiteral("mnopqr", 0, 0)); - auto fdrTable0 = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdrTable0 = buildFDREngineHinted(lits, false, hint, + get_current_target(), Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdrTable0, hint); size_t size = fdrSize(fdrTable0.get()); @@ -394,15 +431,17 @@ TEST_P(FDRp, moveByteStream) { } // check matches - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; hwlm_error_t fdrStatus = fdrExec(fdrTable.get(), (const u8 *)data, - data_len, 0, decentCallback, &matches, + data_len, 0, decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); ASSERT_EQ(1U, matches.size()); - EXPECT_EQ(match(12, 17, 0), matches[0]); + EXPECT_EQ(match(17, 0), matches[0]); + matches.clear(); } TEST_P(FDRp, Stream1) { @@ -418,21 +457,22 @@ TEST_P(FDRp, Stream1) { lits.push_back(hwlmLiteral("f", 0, 0)); lits.push_back(hwlmLiteral("literal", 0, 1)); - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); // check matches - vector matches; fdrStatus = safeExecStreaming(fdr.get(), (const u8 *)data1, data_len1, (const u8 *)data2, data_len2, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + decentCallback, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); ASSERT_EQ(4U, matches.size()); for (size_t i = 0; i < matches.size(); i++) { - EXPECT_EQ(match(i, i, 0), matches[i]); + EXPECT_EQ(match(i, 0), matches[i]); } + matches.clear(); } INSTANTIATE_TEST_CASE_P(FDR, FDRp, ValuesIn(getValidFdrEngines())); @@ -473,12 +513,14 @@ TEST_P(FDRpp, AlignAndTooEarly) { aligned_free_internal); vector lits; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; for (size_t litLen = 1; litLen <= patLen; litLen++) { // building literal from pattern substring of variable length 1-patLen lits.push_back(hwlmLiteral(string(pattern, 0, litLen), 0, 0)); - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), - Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); // check with buffer offset from aligned start from 0 to 31 @@ -492,11 +534,10 @@ TEST_P(FDRpp, AlignAndTooEarly) { pattern.data(), litLen); for (size_t j = 0; j <= 
litLen; j++) { - vector matches; hwlm_error_t fdrStatus = fdrExec(fdr.get(), (const u8 *)dataBufAligned.get() + i + j, 4 * buf_alignment - j * 2, 0, decentCallback, - &matches, HWLM_ALL_GROUPS); + &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); // j == 0 means that start and end matches are entirely within // searched buffer. Otherwise they are out of buffer boundaries @@ -506,8 +547,8 @@ TEST_P(FDRpp, AlignAndTooEarly) { // we should get two and only two matches - at the beginning and // at the end of unaligned buffer ASSERT_EQ(2U, matches.size()); - ASSERT_EQ(match(0, litLen - 1, 0), matches[0]); - ASSERT_EQ(match(4 * buf_alignment - litLen, 4 * buf_alignment - 1, 0), matches[1]); + ASSERT_EQ(match(litLen - 1, 0), matches[0]); + ASSERT_EQ(match(4 * buf_alignment - 1, 0), matches[1]); matches.clear(); } else { // "Too early" / "too late" condition - should not match anything @@ -595,6 +636,8 @@ TEST_P(FDRpa, ShortWritings) { } // run the literal matching through all generated literals + struct hs_scratch scratch; + scratch.fdr_conf = NULL; for (size_t patIdx = 0; patIdx < pats.size();) { // group them in the sets of 32 vector testSigs; @@ -602,8 +645,8 @@ TEST_P(FDRpa, ShortWritings) { testSigs.push_back(hwlmLiteral(pats[patIdx], false, patIdx)); } - auto fdr = fdrBuildTableHinted(testSigs, false, hint, - get_current_target(), Grey()); + auto fdr = buildFDREngineHinted(testSigs, false, hint, + get_current_target(), Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); @@ -613,9 +656,8 @@ TEST_P(FDRpa, ShortWritings) { const string &buf = bufs[bufIdx]; size_t bufLen = buf.size(); - vector matches; hwlm_error_t fdrStatus = fdrExec(fdr.get(), (const u8 *)buf.data(), - bufLen, 0, decentCallback, &matches, HWLM_ALL_GROUPS); + bufLen, 0, decentCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); // build the set of expected matches using standard @@ -628,7 +670,7 @@ TEST_P(FDRpa, ShortWritings) { for (int j = 0; j <= (int)bufLen - (int)patLen; j++) { if (!buf.compare(j, patLen, pat)) { - expMatches.push_back(match(j, j + patLen - 1, + expMatches.push_back(match(j + patLen - 1, testSigs[pIdx].id)); } } @@ -637,6 +679,7 @@ TEST_P(FDRpa, ShortWritings) { sort(expMatches.begin(), expMatches.end()); sort(matches.begin(), matches.end()); ASSERT_EQ(expMatches, matches); + matches.clear(); } } } @@ -662,18 +705,18 @@ TEST(FDR, FDRTermS) { lits.push_back(hwlmLiteral("f", 0, 0)); lits.push_back(hwlmLiteral("ff", 0, 1)); - auto fdr = fdrBuildTable(lits, false, get_current_target(), Grey()); + auto fdr = buildFDREngine(lits, false, get_current_target(), Grey()); ASSERT_TRUE(fdr != nullptr); // check matches - vector matches; fdrStatus = safeExecStreaming(fdr.get(), (const u8 *)data1, data_len1, (const u8 *)data2, data_len2, 0, - decentCallbackT, &matches, HWLM_ALL_GROUPS); + decentCallbackT, HWLM_ALL_GROUPS); ASSERT_EQ(HWLM_TERMINATED, fdrStatus); ASSERT_EQ(1U, matches.size()); + matches.clear(); } TEST(FDR, FDRTermB) { @@ -685,15 +728,17 @@ TEST(FDR, FDRTermB) { lits.push_back(hwlmLiteral("f", 0, 0)); lits.push_back(hwlmLiteral("ff", 0, 1)); - auto fdr = fdrBuildTable(lits, false, get_current_target(), Grey()); + auto fdr = buildFDREngine(lits, false, get_current_target(), Grey()); ASSERT_TRUE(fdr != nullptr); // check matches - vector matches; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; fdrStatus = fdrExec(fdr.get(), (const u8 *)data1, data_len1, - 0, decentCallbackT, &matches, HWLM_ALL_GROUPS); + 0, decentCallbackT, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(HWLM_TERMINATED, fdrStatus); 
ASSERT_EQ(1U, matches.size()); + matches.clear(); } diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp index 952fffc1..81afbeaa 100644 --- a/unit/internal/fdr_flood.cpp +++ b/unit/internal/fdr_flood.cpp @@ -36,6 +36,8 @@ #include "fdr/fdr_engine_description.h" #include "fdr/teddy_compile.h" #include "fdr/teddy_engine_description.h" +#include "hwlm/hwlm_internal.h" +#include "scratch.h" #include "util/alloc.h" #include "util/bitutils.h" @@ -64,34 +66,23 @@ using namespace ue2; namespace { struct match { - size_t start; size_t end; u32 id; - match(size_t start_in, size_t end_in, u32 id_in) - : start(start_in), end(end_in), id(id_in) {} + match(size_t end_in, u32 id_in) : end(end_in), id(id_in) {} bool operator==(const match &b) const { - return start == b.start && end == b.end && id == b.id; + return end == b.end && id == b.id; } bool operator<(const match &b) const { - if (id < b.id) { - return true; - } else if (id == b.id) { - if (start < b.start) { - return true; - } else if (start == b.start) { - return end < b.end; - } - } - return false; + return tie(id, end) < tie(b.id, b.end); } match operator+(size_t adj) { - return match(start + adj, end + adj, id); + return match(end + adj, id); } }; template T &operator<<(T &a, const match &b) { - a << "(" << b.start << ", " << b.end << ", " << b.id << ")"; + a << "(" << b.end << ", " << b.id << ")"; return a; } @@ -105,14 +96,13 @@ T &operator<<(T &a, const vector &b) { return a; } +map matchesCounts; + extern "C" { -static hwlmcb_rv_t countCallback(UNUSED size_t start, UNUSED size_t end, u32 id, - void *cntxt) { - if (cntxt) { - map *matchesCounts = (map *)cntxt; - (*matchesCounts)[id]++; - } +static hwlmcb_rv_t countCallback(UNUSED size_t end, u32 id, + UNUSED struct hs_scratch *scratch) { + matchesCounts[id]++; return HWLM_CONTINUE_MATCHING; } @@ -142,6 +132,16 @@ static vector getValidFdrEngines() { return ret; } +static +bytecode_ptr buildFDREngineHinted(std::vector &lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey) { + auto proto = fdrBuildProtoHinted(HWLM_ENGINE_FDR, lits, make_small, hint, + target, grey); + return fdrBuildTable(*proto, grey); +} + class FDRFloodp : public TestWithParam { }; @@ -152,6 +152,8 @@ TEST_P(FDRFloodp, NoMask) { vector data(dataSize); u8 c = 0; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; while (1) { SCOPED_TRACE((unsigned int)c); u8 bit = 1 << (c & 0x7); @@ -179,14 +181,12 @@ TEST_P(FDRFloodp, NoMask) { lits.push_back(hwlmLiteral(sAlt, false, i * 8 + 7)); } - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), - Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - map matchesCounts; - hwlm_error_t fdrStatus = fdrExec(fdr.get(), &data[0], dataSize, - 0, countCallback, (void *)&matchesCounts, HWLM_ALL_GROUPS); + 0, countCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); for (u8 i = 0; i < 4; i++) { @@ -211,7 +211,7 @@ TEST_P(FDRFloodp, NoMask) { matchesCounts.clear(); memset(&data[0], cAlt, dataSize); fdrStatus = fdrExec(fdr.get(), &data[0], dataSize, - 0, countCallback, (void *)&matchesCounts, HWLM_ALL_GROUPS); + 0, countCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); for (u8 i = 0; i < 4; i++) { @@ -231,6 +231,7 @@ TEST_P(FDRFloodp, NoMask) { ASSERT_EQ(0, matchesCounts[i * 8 + 6]); } } + matchesCounts.clear(); if (++c == 0) { break; @@ -245,6 +246,8 @@ TEST_P(FDRFloodp, WithMask) { vector data(dataSize); u8 c = '\0'; + 
struct hs_scratch scratch; + scratch.fdr_conf = NULL; while (1) { u8 bit = 1 << (c & 0x7); u8 cAlt = c ^ bit; @@ -315,14 +318,12 @@ TEST_P(FDRFloodp, WithMask) { HWLM_ALL_GROUPS, msk, cmp)); } } - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), - Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - map matchesCounts; - hwlm_error_t fdrStatus = fdrExec(fdr.get(), &data[0], dataSize, - 0, countCallback, &matchesCounts, HWLM_ALL_GROUPS); + 0, countCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); const u32 cnt4 = dataSize - 4 + 1; @@ -360,7 +361,7 @@ TEST_P(FDRFloodp, WithMask) { memset(&data[0], cAlt, dataSize); matchesCounts.clear(); fdrStatus = fdrExec(fdr.get(), &data[0], dataSize, - 0, countCallback, &matchesCounts, HWLM_ALL_GROUPS); + 0, countCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); for (u8 i = 0; i < 4; i++) { @@ -393,6 +394,7 @@ TEST_P(FDRFloodp, WithMask) { ASSERT_EQ(0, matchesCounts[i * 12 + 11]); } } + matchesCounts.clear(); if (++c == '\0') { break; @@ -410,6 +412,8 @@ TEST_P(FDRFloodp, StreamingMask) { vector tempdata(dataSize + fake_history_size); // headroom u8 c = '\0'; + struct hs_scratch scratch; + scratch.fdr_conf = NULL; while (1) { u8 bit = 1 << (c & 0x7); u8 cAlt = c ^ bit; @@ -480,11 +484,10 @@ TEST_P(FDRFloodp, StreamingMask) { HWLM_ALL_GROUPS, msk, cmp)); } } - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), - Grey()); + auto fdr = buildFDREngineHinted(lits, false, hint, get_current_target(), + Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - map matchesCounts; hwlm_error_t fdrStatus; const u32 cnt4 = dataSize - 4 + 1; @@ -494,7 +497,7 @@ TEST_P(FDRFloodp, StreamingMask) { // reference past the end of fake history to allow headroom const u8 *fhist = fake_history.data() + fake_history_size; fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0, - countCallback, &matchesCounts, + countCallback, &scratch, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); for (u32 j = streamChunk; j < dataSize; j += streamChunk) { @@ -505,13 +508,11 @@ TEST_P(FDRFloodp, StreamingMask) { const u8 *tmp_d = tempdata.data() + fake_history_size; fdrStatus = fdrExecStreaming(fdr.get(), tmp_d, j, tmp_d + j, streamChunk, 0, countCallback, - &matchesCounts, - HWLM_ALL_GROUPS); + &scratch, HWLM_ALL_GROUPS); } else { fdrStatus = fdrExecStreaming(fdr.get(), d + j - 8, 8, d + j, streamChunk, 0, countCallback, - &matchesCounts, - HWLM_ALL_GROUPS); + &scratch, HWLM_ALL_GROUPS); } ASSERT_EQ(0, fdrStatus); } @@ -552,6 +553,7 @@ TEST_P(FDRFloodp, StreamingMask) { break; } } + matchesCounts.clear(); } INSTANTIATE_TEST_CASE_P(FDRFlood, FDRFloodp, ValuesIn(getValidFdrEngines())); diff --git a/unit/internal/flat_map.cpp b/unit/internal/flat_map.cpp index 6a81bbfe..610c71e1 100644 --- a/unit/internal/flat_map.cpp +++ b/unit/internal/flat_map.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ #include "config.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "ue2common.h" #include "gtest/gtest.h" @@ -403,6 +403,11 @@ TEST(flat_map, max_size) { ASSERT_LE(1ULL << 24, f.max_size()); } +template +size_t hash_value(const FlatMap &f) { + return std::hash()(f); +} + TEST(flat_map, hash_value) 
{ const vector> input = { {0, 0}, {3, 1}, {76, 2}, {132, 3}, {77, 4}, {99999, 5}, {100, 6}}; diff --git a/unit/internal/flat_set.cpp b/unit/internal/flat_set.cpp index 3bee0edb..10607a6f 100644 --- a/unit/internal/flat_set.cpp +++ b/unit/internal/flat_set.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ #include "config.h" -#include "util/ue2_containers.h" +#include "util/flat_containers.h" #include "ue2common.h" #include "gtest/gtest.h" @@ -393,6 +393,11 @@ TEST(flat_set, max_size) { ASSERT_LE(1ULL << 24, f.max_size()); } +template +size_t hash_value(const FlatSet &f) { + return std::hash()(f); +} + TEST(flat_set, hash_value) { const vector input = {0, 15, 3, 1, 20, 32768, 24000000, 17, 100, 101, 104, 99999}; diff --git a/unit/internal/insertion_ordered.cpp b/unit/internal/insertion_ordered.cpp new file mode 100644 index 00000000..6026ce1d --- /dev/null +++ b/unit/internal/insertion_ordered.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ue2common.h" +#include "util/insertion_ordered.h" + +#include "gtest/gtest.h" + +using namespace std; +using namespace ue2; + +template +std::ostream &operator<<(std::ostream &os, + const insertion_ordered_map &m) { + os << "{"; + for (auto it = begin(m); it != end(m); ++it) { + os << "{" << it->first << ", " << it->second << "}"; + if (it != end(m)) { + os << ", "; + } + } + os << "}"; + return os; +} + +TEST(insertion_ordered_map, empty) { + insertion_ordered_map m; + EXPECT_TRUE(m.empty()); + EXPECT_TRUE(m.begin() == m.end()); + EXPECT_EQ(0, m.size()); + + m.insert({10, 10}); + EXPECT_FALSE(m.empty()); + EXPECT_EQ(1, m.size()); + + m.clear(); + EXPECT_TRUE(m.empty()); + EXPECT_TRUE(m.begin() == m.end()); + EXPECT_EQ(0, m.size()); +} + +TEST(insertion_ordered_map, insert) { + const vector> v = {{7, 1}, {1, 2}, {3, 4}, + {10, 5}, {99, 6}, {12, 7}}; + insertion_ordered_map m; + for (const auto &e : v) { + m.insert(e); + } + + EXPECT_FALSE(m.empty()); + EXPECT_EQ(v.size(), m.size()); + vector> v2(m.begin(), m.end()); + EXPECT_EQ(v, v2); +} + +TEST(insertion_ordered_map, insert_iter) { + const vector> v = {{7, 1}, {1, 2}, {3, 4}, + {10, 5}, {99, 6}, {12, 7}}; + insertion_ordered_map m; + m.insert(v.begin(), v.end()); + + EXPECT_FALSE(m.empty()); + EXPECT_EQ(v.size(), m.size()); + vector> v2(m.begin(), m.end()); + EXPECT_EQ(v, v2); +} + +TEST(insertion_ordered_map, find_const) { + const vector> v = {{7, 1}, {1, 2}, {3, 4}, + {10, 5}, {99, 6}, {12, 7}}; + const insertion_ordered_map m(v.begin(), v.end()); + + for (const auto &e : v) { + auto it = m.find(e.first); + ASSERT_NE(m.end(), it); + EXPECT_EQ(e.first, it->first); + EXPECT_EQ(e.second, it->second); + } +} + +TEST(insertion_ordered_map, find_mutable) { + const vector> v = {{7, 1}, {1, 2}, {3, 4}, + {10, 5}, {99, 6}, {12, 7}}; + insertion_ordered_map m(v.begin(), v.end()); + + for (const auto &e : v) { + auto it = m.find(e.first); + ASSERT_NE(m.end(), it); + EXPECT_EQ(e.first, it->first); + EXPECT_EQ(e.second, it->second); + auto &mut = it->second; + ++mut; + EXPECT_EQ(e.second + 1, m.at(e.first)); + } +} + +TEST(insertion_ordered_map, operator_brackets) { + insertion_ordered_map m; + + u32 val = 1000; + for (u32 i = 10; i > 0; i--) { + m[i] = val++; + } + + EXPECT_EQ(10, m.size()); + + val = 1000; + auto it = m.begin(); + for (u32 i = 10; i > 0; i--) { + ASSERT_NE(m.end(), it); + EXPECT_EQ(i, it->first); + EXPECT_EQ(val, it->second); + ++val; + ++it; + } + + ASSERT_EQ(m.end(), it); +} + +template +std::ostream &operator<<(std::ostream &os, const insertion_ordered_set &s) { + os << "{"; + for (auto it = begin(s); it != end(s); ++it) { + os << *it; + if (it != end(s)) { + os << ", "; + } + } + os << "}"; + return os; +} + +TEST(insertion_ordered_set, empty) { + insertion_ordered_set m; + EXPECT_TRUE(m.empty()); + EXPECT_TRUE(m.begin() == m.end()); + EXPECT_EQ(0, m.size()); + + m.insert(10); + EXPECT_FALSE(m.empty()); + EXPECT_EQ(1, m.size()); + + m.clear(); + EXPECT_TRUE(m.empty()); + EXPECT_TRUE(m.begin() == m.end()); + EXPECT_EQ(0, m.size()); +} + +TEST(insertion_ordered_set, insert) { + const vector v = {7, 1, 3, 10, 99, 12}; + insertion_ordered_set s; + for (const auto &e : v) { + s.insert(e); + } + + EXPECT_FALSE(s.empty()); + EXPECT_EQ(v.size(), s.size()); + vector v2(s.begin(), s.end()); + EXPECT_EQ(v, v2); +} + +TEST(insertion_ordered_set, insert_iter) { + const vector v = {7, 1, 3, 10, 99, 12}; + insertion_ordered_set s; + s.insert(v.begin(), v.end()); + + EXPECT_FALSE(s.empty()); + 
EXPECT_EQ(v.size(), s.size()); + vector v2(s.begin(), s.end()); + EXPECT_EQ(v, v2); +} + +TEST(insertion_ordered_set, find_const) { + const vector v = {7, 1, 3, 10, 99, 12}; + const insertion_ordered_set s(v.begin(), v.end()); + + for (const auto &e : v) { + auto it = s.find(e); + ASSERT_NE(s.end(), it); + EXPECT_EQ(e, *it); + } +} diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp index d32f7e8f..2c585ae5 100644 --- a/unit/internal/lbr.cpp +++ b/unit/internal/lbr.cpp @@ -101,8 +101,6 @@ protected: ASSERT_TRUE(g != nullptr); clearReports(*g); - ASSERT_TRUE(isLBR(*g, grey)); - rm.setProgramOffset(0, MATCH_REPORT); /* LBR triggered by dot */ diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp new file mode 100644 index 00000000..d7396b81 --- /dev/null +++ b/unit/internal/multi_bit_compress.cpp @@ -0,0 +1,785 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "gtest/gtest.h" +#include "ue2common.h" +#include "util/compile_error.h" +#include "util/make_unique.h" +#include "util/multibit.h" +#include "util/multibit_build.h" +#include "util/multibit_compress.h" + +using namespace std; +using namespace testing; +using namespace ue2; + +/** \brief Print mmbit structure block by block. */ +UNUSED +static +void mmbit_display(const u8 *bits, u32 total_bits) { + for (u32 i = 0; i < mmbit_size(total_bits); i += 8) { + printf("block %d:", i / 8); + for (s32 j = 7; j >= 0; j--) { + u8 a = (*(bits + i + j)); + printf(" %02x", a); + } + printf("\n"); + } + printf("\n"); +} + +/** \brief Print an MMB_TYPE block. */ +UNUSED +static +void mmbit_display_block(const u8 *bits) { + for (s32 j = 7; j >= 0; j--) { + u8 a = (*(bits + j)); + printf(" %02x", a); + } + printf("\n"); +} + +/** \brief Print mmbit structure block by block. 
*/ +UNUSED +static +void mmbit_display_comp(const u8 *bits, u32 comp_size) { + for (u32 i = 0; i < comp_size; i += 8) { + printf("block %d:", i / 8); + for (s32 j = 7; j >= 0; j--) { + u8 a = (*(bits + i + j)); + printf(" %02x", a); + } + printf("\n"); + } + printf("\n"); +} + +namespace { +class mmbit_holder { +public: + mmbit_holder() {} + explicit mmbit_holder(u32 num_bits, u32 excess = 0) + : data(ue2::make_unique(mmbit_size(num_bits) + 7 + excess)) {} + void init(u32 num_bits) { + assert(!data); + data = ue2::make_unique(mmbit_size(num_bits) + 7); + } + operator u8 *() { + assert(data); + return data.get() + 7; + } + operator const u8 *() const { + assert(data); + return data.get() + 7; + } + +private: + unique_ptr data = nullptr; +}; + +class comp_holder { +public: + comp_holder() {} + explicit comp_holder(u32 length) + : data(ue2::make_unique(length + 7)) {} + void init(u32 length) { + assert(!data); + data = ue2::make_unique(length + 7); + } + operator u8 *() { + assert(data); + return data.get() + 7; + } + operator const u8 *() const { + assert(data); + return data.get() + 7; + } + +private: + unique_ptr data = nullptr; +}; +} + +static +void fill_mmbit(u8 *ba, u32 test_size) { + fill_n(ba, mmbit_size(test_size), 0xff); +} + +// We provide both test size and stride so that larger tests don't take forever +// checking every single key. +struct MultiBitCompTestParam { + u32 size; + u32 stride; +}; + +// Parameterized test case for bounded iterator, rather that propagating +// copypasta. Allocates space as given. +class MultiBitCompTest : public TestWithParam { +protected: + virtual void SetUp() { + const MultiBitCompTestParam &p = GetParam(); + test_size = p.size; + stride = p.stride; + ba.init(test_size); + // blast with ones for the lulz + fill_mmbit(ba, test_size); + } + + virtual void TearDown() {} + + u32 test_size; // number of bits in the multibit + u32 stride; // stride to use for scans + mmbit_holder ba; // multibit storage +}; + +TEST(MultiBitComp, CompCompsizeSparse) { + static const u32 test_set[] = { + 257, + 4097, + (1U << 18) + 1, + (1U << 24) + 1, + (1U << 30) + 1 + }; + for (u32 i = 0; i < 5; i++) { + u32 test_size = test_set[i]; + mmbit_holder ba(test_size); + + // Clear all. + mmbit_clear(ba, test_size); + ASSERT_EQ(sizeof(MMB_TYPE), mmbit_compsize(ba, test_size)); + + // Switch 3 bits on. + mmbit_set(ba, test_size, 0); + mmbit_set(ba, test_size, test_size / 2); + mmbit_set(ba, test_size, test_size - 1); + + switch(test_size){ + case 257: + ASSERT_EQ(sizeof(MMB_TYPE) * 4, mmbit_compsize(ba, test_size)); + break; + case 4097: + ASSERT_EQ(sizeof(MMB_TYPE) * 6, mmbit_compsize(ba, test_size)); + break; + case (1U << 18) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * 9, mmbit_compsize(ba, test_size)); + break; + case (1U << 24) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * 12, mmbit_compsize(ba, test_size)); + break; + case (1U << 30) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * 15, mmbit_compsize(ba, test_size)); + break; + } + size_t comp_size = mmbit_compsize(ba, test_size); + + // Switch 3 bits off. + mmbit_unset(ba, test_size, 0); + mmbit_unset(ba, test_size, test_size / 2); + mmbit_unset(ba, test_size, test_size - 1); + + ASSERT_TRUE(mmbit_any(ba, test_size)); + ASSERT_FALSE(mmbit_any_precise(ba, test_size)); + + ASSERT_EQ(comp_size, mmbit_compsize(ba, test_size)); + + // Clear all again. 
+ mmbit_clear(ba, test_size); + + ASSERT_FALSE(mmbit_any(ba, test_size)); + ASSERT_FALSE(mmbit_any_precise(ba, test_size)); + + ASSERT_EQ(sizeof(MMB_TYPE), mmbit_compsize(ba, test_size)); + } +} + +TEST(MultiBitComp, CompCompsizeDense) { + static const u32 test_set[] = { + 257, + 4097, + (1U << 18) + 1, + (1U << 24) + 1, + (1U << 30) + 1 + }; + for (u32 i = 0; i < 5; i++) { + u32 test_size = test_set[i]; + mmbit_holder ba(test_size); + + // Fill all. (fill_mmbit() is not feasible.) + //fill_mmbit(ba, test_size); + mmbit_init_range(ba, test_size, 0, test_size); + + switch(test_size){ + case 257: + ASSERT_EQ(sizeof(MMB_TYPE) * (1 + 5), + mmbit_compsize(ba, test_size)); + break; + case 4097: + ASSERT_EQ(sizeof(MMB_TYPE) * (3 + (1 + 64)), + mmbit_compsize(ba, test_size)); + break; + case (1U << 18) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * (4 + (1 + 64 + 4096)), + mmbit_compsize(ba, test_size)); + break; + case (1U << 24) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * (5 + (1 + 64 + 4096 + (1U << 18))), + mmbit_compsize(ba, test_size)); + break; + case (1U << 30) + 1: + ASSERT_EQ(sizeof(MMB_TYPE) * (6 + (1 + 64 + 4096 + (1U << 18) + + (1U << 24))), mmbit_compsize(ba, test_size)); + break; + } + size_t comp_size = mmbit_compsize(ba, test_size); + + // Switch 3 bits off. + mmbit_unset(ba, test_size, 0); + mmbit_unset(ba, test_size, test_size / 2); + mmbit_unset(ba, test_size, test_size - 1); + + ASSERT_EQ(comp_size, mmbit_compsize(ba, test_size)); + + // Switch all bits off, not a clear-up. + mmbit_unset_range(ba, test_size, 0, test_size); + + ASSERT_TRUE(mmbit_any(ba, test_size)); + ASSERT_FALSE(mmbit_any_precise(ba, test_size)); + + ASSERT_EQ(comp_size, mmbit_compsize(ba, test_size)); + } +} + +TEST_P(MultiBitCompTest, CompCompressDecompressSparse) { + SCOPED_TRACE(test_size); + ASSERT_TRUE(ba != nullptr); + + // 1st active range --> empty + mmbit_clear(ba, test_size); + + // op 2. + // 2nd active range --> [1/5, 1/3) + u64a begin = test_size / 5; + u64a end = test_size / 3; + for (u64a i = begin; i < end; i++) { + mmbit_set(ba, test_size, i); + } + + // op 3. + // 3rd active range --> [1/5, 1/2) + begin = test_size / 4; + end = test_size / 2; + for (u64a i = begin; i < end; i++) { + mmbit_set(ba, test_size, i); + } + + // op 4. + // 4th active range --> [1/5, 1/4) and [1/3, 1/2) + begin = test_size / 4; + end = test_size / 3; + mmbit_unset_range(ba, test_size, begin, end); + + // op 5. + // 5th active range --> empty + mmbit_clear(ba, test_size); + + // op 6. + // 6th active range --> [1/4, 1/3) + for (u64a i = begin; i < end; i++) { + mmbit_set(ba, test_size, i); + } + + // Initialize compression space. + size_t comp_size = mmbit_compsize(ba, test_size); + comp_holder ca(comp_size); + ASSERT_EQ(1, mmbit_compress(ba, test_size, ca, &comp_size, comp_size)); + + // Initialize decompression space. + mmbit_holder ba_1(test_size); + fill_mmbit(ba_1, test_size); // Dirty decompression space. + ASSERT_EQ(1, mmbit_decompress(ba_1, test_size, ca, &comp_size, comp_size)); + + // Correctness checking, should be [1/4, 1/3). + // And now, begin = test_size / 4, end = test_size / 3. + for (u64a i = 0; i < test_size; i += stride) { + if (i >= begin && i < end) { + ASSERT_TRUE(mmbit_isset(ba_1, test_size, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_1, test_size, i)); + } + } +} + +TEST_P(MultiBitCompTest, CompCompressDecompressDense) { + SCOPED_TRACE(test_size); + ASSERT_TRUE(ba != nullptr); + + ASSERT_TRUE(mmbit_all(ba, test_size)); + + // Sequence of set/unset/clear operations. + // op 1. 
+ // 1st active range --> [0, 1/4) and [1/3, 1) + u64a begin = test_size / 4; + u64a end = test_size / 3; + mmbit_unset_range(ba, test_size, begin, end); + + // op 2. + // 2st active range --> empty + mmbit_clear(ba, test_size); + + // op 3. + // 3rd active range --> [1/5, 1/2) + begin = test_size / 5; + end = test_size / 2; + for (u64a i = begin; i < end; i++) { + mmbit_set(ba, test_size, i); + } + + // op 4. + // 4th active range --> [1/3, 1/2) + end = test_size / 3; + mmbit_unset_range(ba, test_size, begin, end); + + // op 5. + //5th active range --> empty + begin = test_size / 4; + end = test_size / 2; + mmbit_unset_range(ba, test_size, begin, end); + + // Initialize compression space. + size_t comp_size = mmbit_compsize(ba, test_size); + comp_holder ca(comp_size); + ASSERT_EQ(1, mmbit_compress(ba, test_size, ca, &comp_size, comp_size)); + + // Initialize decompression space. + mmbit_holder ba_1(test_size); + fill_mmbit(ba_1, test_size); // Dirty decompression space. + ASSERT_EQ(1, mmbit_decompress(ba_1, test_size, ca, &comp_size, comp_size)); + + // Correctness checking, should be empty. + if (test_size <= MMB_FLAT_MAX_BITS) { + ASSERT_FALSE(mmbit_any(ba, test_size)); + ASSERT_FALSE(mmbit_any(ba_1, test_size)); + } else { + ASSERT_TRUE(mmbit_any(ba, test_size)); + ASSERT_TRUE(mmbit_any(ba_1, test_size)); + } + ASSERT_FALSE(mmbit_any_precise(ba, test_size)); + ASSERT_FALSE(mmbit_any_precise(ba_1, test_size)); +} + +TEST(MultiBitComp, CompIntegration1) { + // 256 + 1 --> smallest 2-level mmbit + u32 total_size = mmbit_size(257); + mmbit_holder ba(257); + + //-------------------- 1 -----------------------// + // Operate on mmbit + mmbit_init_range(ba, 257, 0, 100); + // Compress + size_t comp_size = mmbit_compsize(ba, 257); + comp_holder ca(comp_size); + ASSERT_EQ(1, mmbit_compress(ba, 257, ca, &comp_size, comp_size)); + // Decompress + mmbit_holder ba_1(257); + ASSERT_EQ(1, mmbit_decompress(ba_1, 257, ca, &comp_size, comp_size)); + // Check set range: [0,100) + for (u64a i = 0; i < 257; i++) { + if (i < 100) { + ASSERT_TRUE(mmbit_isset(ba_1, 257, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_1, 257, i)); + } + } + + //-------------------- 2 -----------------------// + // Operate on mmbit + for (u64a i = 190; i < 257; i++) { + mmbit_set(ba_1, 257, i); + } + // Compress + size_t comp_size_1 = mmbit_compsize(ba_1, 257); + comp_holder ca_1(comp_size_1); + ASSERT_EQ(1, mmbit_compress(ba_1, 257, ca_1, &comp_size_1, comp_size_1)); + // Decompress + mmbit_holder ba_2(257); + ASSERT_EQ(1, mmbit_decompress(ba_2, 257, ca_1, &comp_size_1, comp_size_1)); + // Check set range: [0,100) and [190,257) + for (u64a i = 0; i < 257; i++) { + if (i < 100 || i >= 190) { + ASSERT_TRUE(mmbit_isset(ba_2, 257, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_2, 257, i)); + } + } + + //-------------------- 3 -----------------------// + // Operate on mmbit + mmbit_unset_range(ba_2, 257, 190, 192); + // Compress + size_t comp_size_2 = mmbit_compsize(ba_2, 257); + comp_holder ca_2(comp_size_2); + ASSERT_EQ(1, mmbit_compress(ba_2, 257, ca_2, &comp_size_2, comp_size_2)); + // Decompress + mmbit_holder ba_3(257); + ASSERT_EQ(1, mmbit_decompress(ba_3, 257, ca_2, &comp_size_2, comp_size_2)); + // Check set range: [0,100) and [192,257) + for (u64a i = 0; i < 257; i++) { + if (i < 100 || i >= 192) { + ASSERT_TRUE(mmbit_isset(ba_3, 257, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_3, 257, i)); + } + } + + //-------------------- 4 -----------------------// + // Operate on mmbit + for (u64a i = 100; i < 200; i++) { + 
mmbit_set(ba_3, 257, i); + } + // Compress + size_t comp_size_3 = mmbit_compsize(ba_3, 257); + comp_holder ca_3(comp_size_3); + ASSERT_EQ(1, mmbit_compress(ba_3, 257, ca_3, &comp_size_3, comp_size_3)); + // Decompress + mmbit_holder ba_4(257); + ASSERT_EQ(1, mmbit_decompress(ba_4, 257, ca_3, &comp_size_3, comp_size_3)); + // Check set range: full + ASSERT_TRUE(mmbit_all(ba_4, 257)); + + //-------------------- 5 -----------------------// + // Operate on mmbit + mmbit_clear(ba_4, 257); + // Compress + size_t comp_size_4 = mmbit_compsize(ba_4, 257); + comp_holder ca_4(comp_size_4); + ASSERT_EQ(1, mmbit_compress(ba_4, 257, ca_4, &comp_size_4, comp_size_4)); + // Decompress + mmbit_holder ba_5(257); + ASSERT_EQ(1, mmbit_decompress(ba_5, 257, ca_4, &comp_size_4, comp_size_4)); + // Check set range: empty + ASSERT_FALSE(mmbit_any(ba_5, 257)); + ASSERT_FALSE(mmbit_any_precise(ba_5, 257)); + + //-------------------- 6 -----------------------// + // Operate on mmbit + for (u64a i = 100; i < 200; i++) { + mmbit_set(ba_5, 257, i); + } + // Compress + size_t comp_size_5 = mmbit_compsize(ba_5, 257); + comp_holder ca_5(comp_size_5); + ASSERT_EQ(1, mmbit_compress(ba_5, 257, ca_5, &comp_size_5, comp_size_5)); + // Decompress + mmbit_holder ba_6(257); + ASSERT_EQ(1, mmbit_decompress(ba_6, 257, ca_5, &comp_size_5, comp_size_5)); + // Check set range: [100,200) + for (u64a i = 0; i < 257; i++) { + if (i >= 100 && i < 200) { + ASSERT_TRUE(mmbit_isset(ba_6, 257, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_6, 257, i)); + } + } +} + +TEST(MultiBitComp, CompIntegration2) { + // 64^2 + 1 --> smallest 3-level mmbit + u32 total_size = mmbit_size(4097); + mmbit_holder ba(4097); + + //-------------------- 1 -----------------------// + // Operate on mmbit + mmbit_init_range(ba, 4097, 0, 3200); + // Compress + size_t comp_size = mmbit_compsize(ba, 4097); + comp_holder ca(comp_size); + ASSERT_EQ(1, mmbit_compress(ba, 4097, ca, &comp_size, comp_size)); + // Decompress + mmbit_holder ba_1(4097); + ASSERT_EQ(1, mmbit_decompress(ba_1, 4097, ca, &comp_size, comp_size)); + // Check set range: [0, 3200) + for (u64a i = 0; i < 4097; i++) { + if (i < 3200) { + ASSERT_TRUE(mmbit_isset(ba_1, 4097, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_1, 4097, i)); + } + } + + //-------------------- 2 -----------------------// + // Operate on mmbit + mmbit_unset_range(ba_1, 4097, 320, 640); + // Compress + size_t comp_size_1 = mmbit_compsize(ba_1, 4097); + comp_holder ca_1(comp_size_1); + ASSERT_EQ(1, mmbit_compress(ba_1, 4097, ca_1, &comp_size_1, comp_size_1)); + // Decompress + mmbit_holder ba_2(4097); + ASSERT_EQ(1, + mmbit_decompress(ba_2, 4097, ca_1, &comp_size_1, comp_size_1)); + // Check set range: [0, 320) and [640, 3200) + for (u64a i = 0; i < 4097; i++) { + if (i < 320 || (i >= 640 && i < 3200)) { + ASSERT_TRUE(mmbit_isset(ba_2, 4097, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_2, 4097, i)); + } + } + + //-------------------- 3 -----------------------// + // Operate on mmbit + for (u64a i = 3000; i < 4000; i++) { + mmbit_set(ba_2, 4097, i); + } + // Compress + size_t comp_size_2 = mmbit_compsize(ba_2, 4097); + comp_holder ca_2(comp_size_2); + ASSERT_EQ(1, mmbit_compress(ba_2, 4097, ca_2, &comp_size_2, comp_size_2)); + // Decompress + mmbit_holder ba_3(4097); + ASSERT_EQ(1, + mmbit_decompress(ba_3, 4097, ca_2, &comp_size_2, comp_size_2)); + // Check set range: [0, 320) and [640, 4000) + for (u64a i = 0; i < 4097; i++) { + if (i < 320 || (i >= 640 && i < 4000)) { + ASSERT_TRUE(mmbit_isset(ba_3, 4097, i)); + } else { + 
ASSERT_FALSE(mmbit_isset(ba_3, 4097, i)); + } + } + + //-------------------- 4 -----------------------// + // Operate on mmbit + mmbit_unset(ba_3, 4097, 64); + mmbit_unset(ba_3, 4097, 3200); + // Compress + size_t comp_size_3 = mmbit_compsize(ba_3, 4097); + comp_holder ca_3(comp_size_3); + ASSERT_EQ(1, mmbit_compress(ba_3, 4097, ca_3, &comp_size_3, comp_size_3)); + // Decompress + mmbit_holder ba_4(4097); + ASSERT_EQ(1, + mmbit_decompress(ba_4, 4097, ca_3, &comp_size_3, comp_size_3)); + // Check set range: [0,64) and [65, 320) and [640, 3200) and [3201, 4000) + for (u64a i = 0; i < 4097; i++) { + if (i < 64 || (i >= 65 && i < 320) || (i >= 640 && i < 3200) || + (i >= 3201 && i < 4000)) { + ASSERT_TRUE(mmbit_isset(ba_4, 4097, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_4, 4097, i)); + } + } + + //-------------------- 5 -----------------------// + // Operate on mmbit + for (u64a i = 0; i < 4097; i++) { + if (i < 64 || (i >= 65 && i < 320) || (i >= 640 && i < 3200) || + (i >= 3201 && i < 4000)) { + mmbit_unset(ba_4, 4097, i); + } + } + // Compress + size_t comp_size_4 = mmbit_compsize(ba_4, 4097); + comp_holder ca_4(comp_size_4); + ASSERT_EQ(1, mmbit_compress(ba_4, 4097, ca_4, &comp_size_4, comp_size_4)); + // Decompress + mmbit_holder ba_5(4097); + ASSERT_EQ(1, + mmbit_decompress(ba_5, 4097, ca_4, &comp_size_4, comp_size_4)); + // Check set range: empty + ASSERT_TRUE(mmbit_any(ba_5, 4097)); + ASSERT_FALSE(mmbit_any_precise(ba_5, 4097)); + + //-------------------- 6 -----------------------// + // Operate on mmbit + mmbit_set(ba_5, 4097, 4096); + // Compress + size_t comp_size_5 = mmbit_compsize(ba_5, 4097); + comp_holder ca_5(comp_size_5); + ASSERT_EQ(1, mmbit_compress(ba_5, 4097, ca_5, &comp_size_5, comp_size_5)); + // Decompress + mmbit_holder ba_6(4097); + ASSERT_EQ(1, + mmbit_decompress(ba_6, 4097, ca_5, &comp_size_5, comp_size_5)); + // Check set range: [4096, 4096] + for (u64a i = 0; i < 4097; i++) { + if (i == 4096) { + ASSERT_TRUE(mmbit_isset(ba_6, 4097, i)); + } else { + ASSERT_FALSE(mmbit_isset(ba_6, 4097, i)); + } + } +} + +TEST(MultiBitComp, CompIntegration3) { + // 64^3 + 1 --> smallest 4-level mmbit + u32 total_size = mmbit_size(262145); + mmbit_holder ba(262145); + + //-------------------- 1 -----------------------// + // Operate on mmbit + mmbit_init_range(ba, 262145, 0, 262145); + // Compress + size_t comp_size = mmbit_compsize(ba, 262145); + comp_holder ca(comp_size); + ASSERT_EQ(1, mmbit_compress(ba, 262145, ca, &comp_size, comp_size)); + // Decompress + mmbit_holder ba_1(262145); + ASSERT_EQ(1, mmbit_decompress(ba_1, 262145, ca, &comp_size, comp_size)); + // Check set range: full + ASSERT_TRUE(mmbit_all(ba_1, 262145)); + + //-------------------- 2 -----------------------// + // Operate on mmbit + mmbit_unset_range(ba_1, 262145, 0, 64000); + // Compress + size_t comp_size_1 = mmbit_compsize(ba_1, 262145); + comp_holder ca_1(comp_size_1); + ASSERT_EQ(1, + mmbit_compress(ba_1, 262145, ca_1, &comp_size_1, comp_size_1)); + // Decompress + mmbit_holder ba_2(262145); + ASSERT_EQ(1, + mmbit_decompress(ba_2, 262145, ca_1, &comp_size_1, comp_size_1)); + // Check set range: [64000, 262145) + for (u64a i = 0; i < 262145; i++) { + if (i < 64000) { + ASSERT_FALSE(mmbit_isset(ba_2, 262145, i)); + } else { + ASSERT_TRUE(mmbit_isset(ba_2, 262145, i)); + } + } + + //-------------------- 3 -----------------------// + // Operate on mmbit + mmbit_unset_range(ba_2, 262145, 64001, 256000); + // Compress + size_t comp_size_2 = mmbit_compsize(ba_2, 262145); + comp_holder ca_2(comp_size_2); + 
ASSERT_EQ(1,
+              mmbit_compress(ba_2, 262145, ca_2, &comp_size_2, comp_size_2));
+    // Decompress
+    mmbit_holder ba_3(262145);
+    ASSERT_EQ(1,
+              mmbit_decompress(ba_3, 262145, ca_2, &comp_size_2, comp_size_2));
+    // Check set range: [64000, 64000] and [256000, 262145)
+    for (u64a i = 0; i < 262145; i++) {
+        if (i == 64000 || i >= 256000) {
+            ASSERT_TRUE(mmbit_isset(ba_3, 262145, i));
+        } else {
+            ASSERT_FALSE(mmbit_isset(ba_3, 262145, i));
+        }
+    }
+
+    //-------------------- 4 -----------------------//
+    // Operate on mmbit
+    mmbit_unset_range(ba_3, 262145, 256001, 262145);
+    // Compress
+    size_t comp_size_3 = mmbit_compsize(ba_3, 262145);
+    comp_holder ca_3(comp_size_3);
+    ASSERT_EQ(1,
+              mmbit_compress(ba_3, 262145, ca_3, &comp_size_3, comp_size_3));
+    // Decompress
+    mmbit_holder ba_4(262145);
+    ASSERT_EQ(1,
+              mmbit_decompress(ba_4, 262145, ca_3, &comp_size_3, comp_size_3));
+    // Check set range: [64000, 64000] and [256000, 256000]
+    ASSERT_EQ(64000, mmbit_iterate(ba_4, 262145, MMB_INVALID));
+    ASSERT_EQ(256000, mmbit_iterate(ba_4, 262145, 64000));
+    ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba_4, 262145, 256000));
+
+    //-------------------- 5 -----------------------//
+    // Operate on mmbit
+    mmbit_unset(ba_4, 262145, 64000);
+    mmbit_unset(ba_4, 262145, 256000);
+    // Compress
+    size_t comp_size_4 = mmbit_compsize(ba_4, 262145);
+    comp_holder ca_4(comp_size_4);
+    ASSERT_EQ(1,
+              mmbit_compress(ba_4, 262145, ca_4, &comp_size_4, comp_size_4));
+    // Decompress
+    mmbit_holder ba_5(262145);
+    ASSERT_EQ(1,
+              mmbit_decompress(ba_5, 262145, ca_4, &comp_size_4, comp_size_4));
+    // Check set range: empty
+    ASSERT_TRUE(mmbit_any(ba_5, 262145));
+    ASSERT_FALSE(mmbit_any_precise(ba_5, 262145));
+}
+
+static const MultiBitCompTestParam multibitCompTests[] = {
+    // We provide both test size and stride so that larger tests don't take
+    // forever checking every single key.
+
+    // Small cases, stride 1.
+    { 4, 1 },
+    { 7, 1 },
+    { 8, 1 },
+    { 13, 1 },
+    { 16, 1 },
+    { 17, 1 },
+    { 32, 1 },
+    { 33, 1 },
+    { 57, 1 },
+    { 64, 1 },
+    { 65, 1 },
+    { 100, 1 },
+    { 128, 1 },
+    { 200, 1 },
+    { 256, 1 },
+    { 257, 1 }, // 257 = 256 + 1
+    { 302, 1 },
+    { 1024, 1 },
+    { 1025, 1 },
+    { 2099, 1 },
+    { 4097, 1 }, // 4097 = 64 ^ 2 + 1
+    { 10000, 1 },
+    { 32768, 1 },
+    { 32769, 1 },
+    { 200000, 1 },
+    { 262145, 1 }, // 262145 = 64 ^ 3 + 1
+
+    // Larger cases, bigger strides.
+ { 1U << 19, 3701 }, + { 1U << 20, 3701 }, + { 1U << 21, 3701 }, + { 1U << 22, 3701 }, + { 1U << 23, 3701 }, + { 1U << 24, 3701 }, + { 1U << 25, 3701 }, + { 1U << 26, 3701 }, + { 1U << 27, 7919 }, + { 1U << 28, 15073 }, + { 1U << 29, 24413 }, + { 1U << 30, 50377 }, + { 1U << 31, 104729 }, +}; + +INSTANTIATE_TEST_CASE_P(MultiBitComp, MultiBitCompTest, + ValuesIn(multibitCompTests)); diff --git a/unit/internal/nfagraph_util.cpp b/unit/internal/nfagraph_util.cpp index b6952f5a..e6a58b55 100644 --- a/unit/internal/nfagraph_util.cpp +++ b/unit/internal/nfagraph_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -79,8 +79,8 @@ TEST(NFAGraph, split1) { NFAVertex pivot = c; - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); @@ -130,8 +130,8 @@ TEST(NFAGraph, split2) { NFAVertex pivot = c; - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); @@ -203,8 +203,8 @@ TEST(NFAGraph, split3) { pivots.push_back(d); pivots.push_back(g); - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); @@ -280,8 +280,8 @@ TEST(NFAGraph, split4) { pivots.push_back(d); pivots.push_back(g); - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; + unordered_map lhs_map; + unordered_map rhs_map; splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); diff --git a/unit/internal/noodle.cpp b/unit/internal/noodle.cpp index 5df66236..7cf5744f 100644 --- a/unit/internal/noodle.cpp +++ b/unit/internal/noodle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "hwlm/noodle_engine.h" #include "hwlm/hwlm.h" #include "hwlm/hwlm_literal.h" +#include "scratch.h" #include "util/alloc.h" #include "util/ue2string.h" @@ -45,204 +46,184 @@ using std::vector; using namespace ue2; struct hlmMatchEntry { - size_t from; size_t to; u32 id; - hlmMatchEntry(size_t start, size_t end, u32 identifier) : - from(start), to(end), id(identifier) {} + hlmMatchEntry(size_t end, u32 identifier) : + to(end), id(identifier) {} }; -typedef vector hlmMatchRecord; +vector ctxt; static -hwlmcb_rv_t hlmSimpleCallback(size_t from, size_t to, u32 id, void *context) { - hlmMatchRecord *mr = (hlmMatchRecord *)context; +hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id, + UNUSED struct hs_scratch *scratch) { + DEBUG_PRINTF("match @%zu = %u\n", to, id); - DEBUG_PRINTF("match @%zu = %u,%p\n", to, id, context); - - mr->push_back(hlmMatchEntry(from, to, id)); + ctxt.push_back(hlmMatchEntry(to, id)); return HWLM_CONTINUE_MATCHING; } static void noodleMatch(const u8 *data, size_t data_len, const char *lit_str, - size_t lit_len, char nocase, HWLMCallback cb, void *ctxt) { + size_t lit_len, char nocase, HWLMCallback cb) { u32 id = 1000; hwlmLiteral lit(std::string(lit_str, lit_len), nocase, id); auto n = noodBuildTable(lit); ASSERT_TRUE(n != nullptr); hwlm_error_t rv; - rv = 
noodExec(n.get(), data, data_len, 0, cb, ctxt); + struct hs_scratch scratch; + rv = noodExec(n.get(), data, data_len, 0, cb, &scratch); ASSERT_EQ(HWLM_SUCCESS, rv); } TEST(Noodle, nood1) { const size_t data_len = 1024; unsigned int i, j; - hlmMatchRecord ctxt; u8 data[data_len]; memset(data, 'a', data_len); - noodleMatch(data, data_len, "a", 1, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "a", 1, 0, hlmSimpleCallback); ASSERT_EQ(1024U, ctxt.size()); for (i = 0; i < 1024; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len, "A", 1, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "A", 1, 0, hlmSimpleCallback); ASSERT_EQ(0U, ctxt.size()); ctxt.clear(); - noodleMatch(data, data_len, "A", 1, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "A", 1, 1, hlmSimpleCallback); ASSERT_EQ(1024U, ctxt.size()); for (i = 0; i < 1024; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i, ctxt[i].to); } for (j = 0; j < 16; j++) { ctxt.clear(); - noodleMatch(data + j, data_len - j, "A", 1, 1, hlmSimpleCallback, - &ctxt); + noodleMatch(data + j, data_len - j, "A", 1, 1, hlmSimpleCallback); ASSERT_EQ(1024 - j, ctxt.size()); for (i = 0; i < 1024 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len - j, "A", 1, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len - j, "A", 1, 1, hlmSimpleCallback); ASSERT_EQ(1024 - j, ctxt.size()); for (i = 0; i < 1024 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i, ctxt[i].to); } } + ctxt.clear(); } TEST(Noodle, nood2) { const size_t data_len = 1024; unsigned int i, j; - hlmMatchRecord ctxt; u8 data[data_len]; memset(data, 'a', data_len); - noodleMatch(data, data_len, "aa", 2, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aa", 2, 0, hlmSimpleCallback); ASSERT_EQ(1023U, ctxt.size()); for (i = 0; i < 1023; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len, "aA", 2, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aA", 2, 0, hlmSimpleCallback); ASSERT_EQ(0U, ctxt.size()); ctxt.clear(); - noodleMatch(data, data_len, "AA", 2, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "AA", 2, 0, hlmSimpleCallback); ASSERT_EQ(0U, ctxt.size()); ctxt.clear(); - noodleMatch(data, data_len, "aa", 2, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aa", 2, 1, hlmSimpleCallback); ASSERT_EQ(1023U, ctxt.size()); for (i = 0; i < 1023; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len, "Aa", 2, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "Aa", 2, 1, hlmSimpleCallback); ASSERT_EQ(1023U, ctxt.size()); for (i = 0; i < 1023; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len, "AA", 2, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "AA", 2, 1, hlmSimpleCallback); ASSERT_EQ(1023U, ctxt.size()); for (i = 0; i < 1023; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } for (j = 0; j < 16; j++) { ctxt.clear(); - noodleMatch(data + j, data_len - j, "Aa", 2, 1, hlmSimpleCallback, - &ctxt); + noodleMatch(data + j, data_len - j, "Aa", 2, 1, hlmSimpleCallback); ASSERT_EQ(1023 - j, ctxt.size()); for (i = 0; i < 1023 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len - j, "aA", 2, 1, hlmSimpleCallback, &ctxt); + 
noodleMatch(data, data_len - j, "aA", 2, 1, hlmSimpleCallback); ASSERT_EQ(1023 - j, ctxt.size()); for (i = 0; i < 1023 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } } + ctxt.clear(); } TEST(Noodle, noodLong) { const size_t data_len = 1024; unsigned int i, j; - hlmMatchRecord ctxt; u8 data[data_len]; memset(data, 'a', data_len); - noodleMatch(data, data_len, "aaaa", 4, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aaaa", 4, 0, hlmSimpleCallback); ASSERT_EQ(1021U, ctxt.size()); for (i = 0; i < 1021; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 3, ctxt[i].to); } ctxt.clear(); - noodleMatch(data, data_len, "aaAA", 4, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aaAA", 4, 0, hlmSimpleCallback); ASSERT_EQ(0U, ctxt.size()); ctxt.clear(); - noodleMatch(data, data_len, "aaAA", 4, 1, hlmSimpleCallback, &ctxt); + noodleMatch(data, data_len, "aaAA", 4, 1, hlmSimpleCallback); ASSERT_EQ(1021U, ctxt.size()); for (i = 0; i < 1021; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 3, ctxt[i].to); } for (j = 0; j < 16; j++) { ctxt.clear(); - noodleMatch(data + j, data_len - j, "AAaa", 4, 1, hlmSimpleCallback, - &ctxt); + noodleMatch(data + j, data_len - j, "AAaa", 4, 1, hlmSimpleCallback); ASSERT_EQ(1021 - j, ctxt.size()); for (i = 0; i < 1021 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 3, ctxt[i].to); } ctxt.clear(); - noodleMatch(data + j, data_len - j, "aaaA", 4, 1, hlmSimpleCallback, - &ctxt); + noodleMatch(data + j, data_len - j, "aaaA", 4, 1, hlmSimpleCallback); ASSERT_EQ(1021 - j, ctxt.size()); for (i = 0; i < 1021 - j; i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 3, ctxt[i].to); } } + ctxt.clear(); } TEST(Noodle, noodCutoverSingle) { const size_t max_data_len = 128; - hlmMatchRecord ctxt; u8 data[max_data_len + 15]; memset(data, 'a', max_data_len + 15); @@ -250,19 +231,18 @@ TEST(Noodle, noodCutoverSingle) { for (u32 align = 0; align < 16; align++) { for (u32 len = 0; len < max_data_len; len++) { ctxt.clear(); - noodleMatch(data + align, len, "a", 1, 0, hlmSimpleCallback, &ctxt); + noodleMatch(data + align, len, "a", 1, 0, hlmSimpleCallback); EXPECT_EQ(len, ctxt.size()); for (u32 i = 0; i < ctxt.size(); i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i, ctxt[i].to); } } } + ctxt.clear(); } TEST(Noodle, noodCutoverDouble) { const size_t max_data_len = 128; - hlmMatchRecord ctxt; u8 data[max_data_len + 15]; memset(data, 'a', max_data_len + 15); @@ -270,14 +250,13 @@ TEST(Noodle, noodCutoverDouble) { for (u32 align = 0; align < 16; align++) { for (u32 len = 0; len < max_data_len; len++) { ctxt.clear(); - noodleMatch(data + align, len, "aa", 2, 0, hlmSimpleCallback, - &ctxt); + noodleMatch(data + align, len, "aa", 2, 0, hlmSimpleCallback); EXPECT_EQ(len ? 
len - 1 : 0U, ctxt.size()); for (u32 i = 0; i < ctxt.size(); i++) { - ASSERT_EQ(i, ctxt[i].from); ASSERT_EQ(i + 1, ctxt[i].to); } } } + ctxt.clear(); } diff --git a/unit/internal/rose_build_merge.cpp b/unit/internal/rose_build_merge.cpp index 291c241a..5029f0a5 100644 --- a/unit/internal/rose_build_merge.cpp +++ b/unit/internal/rose_build_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "rose/rose_build.h" #include "rose/rose_build_impl.h" #include "rose/rose_build_merge.h" +#include "rose/rose_build_role_aliasing.h" #include "util/report_manager.h" #include "util/boundary_reports.h" #include "util/compile_context.h" @@ -42,7 +43,11 @@ #include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" -using std::vector; +#include +#include +#include + +using namespace std; using namespace ue2; static @@ -78,7 +83,7 @@ RoseVertex addVertex(RoseBuildImpl &build, RoseVertex parent, u32 lit_id) { static size_t numUniqueSuffixGraphs(const RoseGraph &g) { - ue2::unordered_set seen; + unordered_set seen; for (const auto &v : vertices_range(g)) { if (g[v].suffix) { diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp index 06407c41..0c9d2607 100644 --- a/unit/internal/shufti.cpp +++ b/unit/internal/shufti.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -910,12 +910,13 @@ TEST(ReverseShufti, ExecNoMatch1) { int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); - char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rshuftiExec(lo, hi, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } @@ -929,12 +930,13 @@ TEST(ReverseShufti, ExecNoMatch2) { int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); - char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rshuftiExec(lo, hi, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } @@ -947,12 +949,13 @@ TEST(ReverseShufti, ExecNoMatch3) { int ret = shuftiBuildMasks(chars, (u8 *)&lo, (u8 *)&hi); ASSERT_NE(-1, ret); - char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + char t[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rshuftiExec(lo, hi, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } diff --git a/unit/internal/truffle.cpp b/unit/internal/truffle.cpp index e9e4f19c..988eb13c 100644 --- a/unit/internal/truffle.cpp +++ b/unit/internal/truffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel 
Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -391,12 +391,13 @@ TEST(ReverseTruffle, ExecNoMatch1) { truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); - char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rtruffleExec(mask1, mask2, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } @@ -410,12 +411,13 @@ TEST(ReverseTruffle, ExecNoMatch2) { truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); - char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rtruffleExec(mask1, mask2, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } @@ -427,12 +429,13 @@ TEST(ReverseTruffle, ExecNoMatch3) { truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2); - char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + char t[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + char *t1 = t + 1; size_t len = strlen(t1); for (size_t i = 0; i < 16; i++) { const u8 *rv = rtruffleExec(mask1, mask2, (u8 *)t1, (u8 *)t1 + len - i); - ASSERT_EQ((const u8 *)(t1 - 1), rv); + ASSERT_EQ((const u8 *)t, rv); } } diff --git a/unit/internal/util_string.cpp b/unit/internal/util_string.cpp index d6f7285a..f501f66b 100644 --- a/unit/internal/util_string.cpp +++ b/unit/internal/util_string.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,18 @@ using namespace ue2; +#if defined(DUMP_SUPPORT) + +namespace ue2 { + +static void PrintTo(const ue2_literal &lit, ::std::ostream *os) { + *os << dumpString(lit); +} + +} // namespace ue2 + +#endif // DUMP_SUPPORT + TEST(string, case_iter1) { const char * const expected[] = { "3FOO-BAR", diff --git a/util/cross_compile.cpp b/util/cross_compile.cpp index b4d1f5f1..0d136998 100644 --- a/util/cross_compile.cpp +++ b/util/cross_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,11 +39,12 @@ using namespace std; struct XcompileMode { - const char *name; + const string name; unsigned long long cpu_features; }; static const XcompileMode xcompile_options[] = { + { "avx512", HS_CPU_FEATURES_AVX512 }, { "avx2", HS_CPU_FEATURES_AVX2 }, { "base", 0 }, }; @@ -60,12 +61,9 @@ unique_ptr xcompileReadMode(const char *s) { bool found_mode = false; if (!opt.empty()) { - const size_t numOpts = ARRAY_LENGTH(xcompile_options); - for (size_t i = 0; i < numOpts; i++) { - if (opt.compare(xcompile_options[i].name) == 0) { - DEBUG_PRINTF("found opt %zu:%llu\n", i, - xcompile_options[i].cpu_features); - rv.cpu_features = xcompile_options[i].cpu_features; + for (const auto &xcompile : xcompile_options) { + if (opt == 
xcompile.name) { + rv.cpu_features = xcompile.cpu_features; found_mode = true; break; } @@ -88,6 +86,11 @@ string to_string(const hs_platform_info &p) { if (p.cpu_features) { u64a features = p.cpu_features; + if (features & HS_CPU_FEATURES_AVX512) { + out << " avx512"; + features &= ~HS_CPU_FEATURES_AVX512; + } + if (features & HS_CPU_FEATURES_AVX2) { out << " avx2"; features &= ~HS_CPU_FEATURES_AVX2; @@ -103,13 +106,11 @@ string to_string(const hs_platform_info &p) { string xcompileUsage(void) { string variants = "Instruction set options: "; - const size_t numOpts = ARRAY_LENGTH(xcompile_options); - for (size_t i = 0; i < numOpts; i++) { - variants += xcompile_options[i].name; - if (i + 1 != numOpts) { - variants += ", "; - } + const auto commaspace = ", "; + auto sep = ""; + for (const auto &xcompile : xcompile_options) { + variants += sep + xcompile.name; + sep = commaspace; } - return variants; } diff --git a/util/expressions.cpp b/util/expressions.cpp index a81e0cd5..b33f8972 100644 --- a/util/expressions.cpp +++ b/util/expressions.cpp @@ -42,6 +42,7 @@ #include #if !defined(_WIN32) #include +#include #include #else // Windows support is probably very fragile @@ -145,8 +146,9 @@ bool isIgnorable(const std::string &f) { #ifndef _WIN32 void loadExpressions(const string &inPath, ExpressionMap &exprMap) { // Is our input path a file or a directory? + int fd = open(inPath.c_str(), O_RDONLY); struct stat st; - if (stat(inPath.c_str(), &st) != 0) { + if (fstat(fd, &st) != 0) { cerr << "Can't stat path: '" << inPath << "'" << endl; exit(1); } @@ -159,7 +161,7 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { exit(1); } } else if (S_ISDIR(st.st_mode)) { - DIR *d = opendir(inPath.c_str()); + DIR *d = fdopendir(fd); if (d == nullptr) { cerr << "Can't open directory: '" << inPath << "'" << endl; exit(1); @@ -188,11 +190,12 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { exit(1); } } - closedir(d); + (void)closedir(d); } else { cerr << "Can't stat path: '" << inPath << "'" << endl; exit(1); } + (void)close(fd); } #else // windows TODO: improve void HS_CDECL loadExpressions(const string &inPath, ExpressionMap &exprMap) { diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index 19ab7edf..c5fad785 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -42,7 +42,6 @@ #include "util/container.h" #include "util/graph_range.h" #include "util/make_unique.h" -#include "util/ue2_containers.h" #include "util/ue2string.h" #include "util/unicode_def.h" #include "util/unicode_set.h" @@ -52,6 +51,7 @@ #include #include #include +#include #include #include @@ -143,7 +143,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps, vector> open; open.push_back(ue2::make_unique(1, g.start)); - ue2::unordered_set one_way_in; + unordered_set one_way_in; for (const auto &v : vertices_range(g)) { if (in_degree(v, g) <= 1) { one_way_in.insert(v); diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp index 0a1f796f..0a896f73 100644 --- a/util/ng_find_matches.cpp +++ b/util/ng_find_matches.cpp @@ -41,6 +41,7 @@ #include "util/compare.h" #include "util/report.h" #include "util/report_manager.h" +#include "util/unordered.h" #include @@ -752,12 +753,34 @@ bool operator==(const StateSet::State &a, const StateSet::State &b) { a.som == b.som; } +/** \brief Cache to speed up edge lookups, rather than hitting the graph. 
*/ +struct EdgeCache { + explicit EdgeCache(const NGHolder &g) { + cache.reserve(num_vertices(g)); + for (auto e : edges_range(g)) { + cache.emplace(make_pair(source(e, g), target(e, g)), e); + } + } + + NFAEdge get(NFAVertex u, NFAVertex v) const { + auto it = cache.find(make_pair(u, v)); + if (it != cache.end()) { + return it->second; + } + return NFAEdge(); + } + +private: + ue2_unordered_map, NFAEdge> cache; +}; + struct fmstate { const size_t num_states; // number of vertices in graph StateSet states; // currently active states StateSet next; // states on after this iteration GraphCache &gc; vector vertices; // mapping from index to vertex + EdgeCache edge_cache; size_t offset = 0; unsigned char cur = 0; unsigned char prev = 0; @@ -771,7 +794,7 @@ struct fmstate { states(num_states, edit_distance), next(num_states, edit_distance), gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), - utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { + edge_cache(g), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { // init states states.activateState( StateSet::State {g[g.start].index, 0, 0, @@ -889,7 +912,7 @@ void getAcceptMatches(const NGHolder &g, MatchSet &matches, eod ? state.gc.vertex_eod_reports_by_level[cur.level][u] : state.gc.vertex_reports_by_level[cur.level][u]; - NFAEdge e = edge(u, accept_vertex, g); + NFAEdge e = state.edge_cache.get(u, accept_vertex); // we assume edge assertions only exist at level 0 if (e && !canReach(g, e, state)) { @@ -965,7 +988,7 @@ void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) { } else { // we assume edge assertions only exist on level 0 const CharReach &cr = g[v].char_reach; - NFAEdge e = edge(u, v, g); + NFAEdge e = state.edge_cache.get(u, v); if (cr.test(state.cur) && (!e || canReach(g, e, state))) {
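Editor's note: for readers unfamiliar with the multibit compression API exercised by the MultiBitComp tests earlier in this patch, the round-trip they assert follows the pattern sketched below. This is a minimal illustration built only from calls that appear in those tests; the header names, common-types include, and raw-buffer handling are assumptions for the sketch and are not part of the patch, which uses the mmbit_holder/comp_holder test fixtures instead.

    /* Sketch only: mirrors the compress/decompress call pattern asserted in
     * the MultiBitComp tests. Assumed includes, not taken from the patch. */
    #include <stdlib.h>
    #include "ue2common.h"               /* assumed: u8/u32 typedefs */
    #include "util/multibit.h"           /* assumed: mmbit_* operations */
    #include "util/multibit_compress.h"  /* assumed: mmbit_compress/decompress */

    static void mmbit_roundtrip_sketch(u32 total) {
        /* Live multibit and a separate decompression target. */
        u8 *bits = calloc(mmbit_size(total), 1);
        u8 *bits2 = calloc(mmbit_size(total), 1);

        /* Activate some keys, as the tests do with mmbit_init_range(). */
        mmbit_init_range(bits, total, 0, total / 2);

        /* Ask how much space the compressed form needs, then compress.
         * Both compress and decompress return 1 on success, which is what
         * the tests assert. */
        size_t comp_size = mmbit_compsize(bits, total);
        u8 *comp = malloc(comp_size);
        mmbit_compress(bits, total, comp, &comp_size, comp_size);
        mmbit_decompress(bits2, total, comp, &comp_size, comp_size);

        /* bits2 now answers mmbit_isset() identically to bits, which is the
         * correctness check the tests perform key by key. */
        free(bits);
        free(bits2);
        free(comp);
    }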