Merge branch develop into master

Matthew Barr 2017-09-22 15:20:28 +10:00
commit 3dcd51c272
250 changed files with 10602 additions and 5305 deletions

View File

@@ -2,6 +2,17 @@
This is a list of notable changes to Hyperscan, in reverse chronological order.
## [4.6.0] 2017-09-22
- New API feature: stream state compression. This allows the user to compress
and restore state for streams to reduce memory usage.
- Many improvements to literal matching performance, including more support
for Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512).
- Compile time improvements, mainly reducing compiler memory allocation.
Also results in reduced compile time for some pattern sets.
- Bugfix for issue #62: fix error building Hyperscan using older versions of
Boost.
- Small updates to fix warnings identified by Coverity.
## [4.5.2] 2017-07-26
- Bugfix for issue #57: Treat characters between `\Q.\E` as codepoints in
UTF8 mode.

View File

@@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11)
project (hyperscan C CXX)
set (HS_MAJOR_VERSION 4)
set (HS_MINOR_VERSION 5)
set (HS_PATCH_VERSION 2)
set (HS_MINOR_VERSION 6)
set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -38,6 +38,7 @@ endif()
set(BINDIR "${PROJECT_BINARY_DIR}/bin")
set(LIBDIR "${PROJECT_BINARY_DIR}/lib")
set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR})
# First for the generic no-config case
@@ -57,6 +58,11 @@ if(CMAKE_GENERATOR STREQUAL Xcode)
set(XCODE TRUE)
endif()
# older versions of cmake don't know that these toolchains support isystem
if (XCODE OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem")
endif ()
set(CMAKE_INCLUDE_CURRENT_DIR 1)
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PROJECT_BINARY_DIR})
@@ -148,8 +154,9 @@ if(MSVC OR MSVC_IDE)
# todo: change these as required
set(ARCH_C_FLAGS "/arch:AVX2")
set(ARCH_CXX_FLAGS "/arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /wd4244 /wd4267")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 -D_CRT_SECURE_NO_WARNINGS")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD")
endif()
string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
@@ -248,7 +255,13 @@ else()
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0 -Wno-unused-local-typedefs -Wno-maybe-uninitialized")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
endif ()
# don't complain about abi
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
endif()
if (NOT(ARCH_IA32 AND RELEASE_BUILD))
@@ -256,11 +269,6 @@ else()
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
endif()
if (RELEASE_BUILD)
# we don't need the noise of ABI warnings in a release build
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
endif ()
if (CMAKE_C_COMPILER_ID MATCHES "Intel")
set(SKYLAKE_FLAG "-xCORE-AVX512")
@@ -396,18 +404,14 @@ if (CXX_MISSING_DECLARATIONS)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations")
endif()
CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)
# gcc5 complains about this
CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)
endif()
if (NOT XCODE)
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
else()
# cmake doesn't think Xcode supports isystem
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${Boost_INCLUDE_DIRS}")
endif()
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(LINUX TRUE)
@@ -419,10 +423,10 @@ endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
if(NOT WIN32)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable=remark")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable=remark")
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable=remark")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable=remark")
endif()
endif()
@@ -513,6 +517,9 @@ set (hs_exec_SRCS
src/crc32.h
src/report.h
src/runtime.c
src/stream_compress.c
src/stream_compress.h
src/stream_compress_impl.h
src/fdr/fdr.c
src/fdr/fdr.h
src/fdr/fdr_internal.h
@@ -629,6 +636,7 @@ set (hs_exec_SRCS
src/util/masked_move.h
src/util/multibit.h
src/util/multibit.c
src/util/multibit_compress.h
src/util/multibit_internal.h
src/util/pack_bits.h
src/util/popcount.h
@@ -651,7 +659,7 @@ set (hs_exec_avx2_SRCS
)
SET (hs_SRCS
SET (hs_compile_SRCS
${hs_HEADERS}
src/crc32.h
src/database.h
@@ -659,7 +667,6 @@ SET (hs_SRCS
src/grey.h
src/hs.cpp
src/hs_internal.h
src/hs_version.c
src/hs_version.h
src/scratch.h
src/state.h
@@ -735,6 +742,7 @@ SET (hs_SRCS
src/nfa/nfa_build_util.h
src/nfa/nfa_internal.h
src/nfa/nfa_kind.h
src/nfa/rdfa.cpp
src/nfa/rdfa.h
src/nfa/rdfa_graph.cpp
src/nfa/rdfa_graph.h
@@ -960,6 +968,7 @@ SET (hs_SRCS
src/rose/rose_build_merge.cpp
src/rose/rose_build_merge.h
src/rose/rose_build_misc.cpp
src/rose/rose_build_misc.h
src/rose/rose_build_program.cpp
src/rose/rose_build_program.h
src/rose/rose_build_resources.h
@@ -996,9 +1005,13 @@ SET (hs_SRCS
src/util/dump_mask.h
src/util/fatbit_build.cpp
src/util/fatbit_build.h
src/util/flat_containers.h
src/util/graph.h
src/util/graph_range.h
src/util/graph_small_color_map.h
src/util/hash.h
src/util/hash_dynamic_bitset.h
src/util/insertion_ordered.h
src/util/math.h
src/util/multibit_build.cpp
src/util/multibit_build.h
@@ -1016,7 +1029,6 @@ SET (hs_SRCS
src/util/small_vector.h
src/util/target_info.cpp
src/util/target_info.h
src/util/ue2_containers.h
src/util/ue2_graph.h
src/util/ue2string.cpp
src/util/ue2string.h
@@ -1024,6 +1036,7 @@ SET (hs_SRCS
src/util/unicode_def.h
src/util/unicode_set.h
src/util/uniform_ops.h
src/util/unordered.h
src/util/verify_types.h
)
@@ -1076,7 +1089,7 @@ set(hs_dump_SRCS
)
if (DUMP_SUPPORT)
set(hs_SRCS ${hs_SRCS} ${hs_dump_SRCS})
set(hs_compile_SRCS ${hs_compile_SRCS} ${hs_dump_SRCS})
endif()
# we group things by sublibraries, specifying shared and static and then
@@ -1099,12 +1112,20 @@ if (NOT FAT_RUNTIME)
add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>)
set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec>)
add_library(hs_compile OBJECT ${hs_compile_SRCS})
add_library(hs STATIC
src/hs_version.c
src/hs_valid_platform.c
$<TARGET_OBJECTS:hs_exec>
$<TARGET_OBJECTS:hs_compile>)
endif (BUILD_STATIC_LIBS)
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
set_target_properties(hs_exec_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_library(hs_compile_shared OBJECT ${hs_compile_SRCS})
set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
endif()
else (FAT_RUNTIME)
@@ -1158,10 +1179,11 @@ else (FAT_RUNTIME)
$<TARGET_OBJECTS:hs_exec_common>
${RUNTIME_LIBS})
set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
add_library(hs_compile OBJECT ${hs_compile_SRCS})
# we want the static lib for testing
add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c
${hs_SRCS}
$<TARGET_OBJECTS:hs_compile>
$<TARGET_OBJECTS:hs_exec_common>
${RUNTIME_LIBS})
@@ -1169,6 +1191,8 @@ else (FAT_RUNTIME)
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
# build shared libs
add_library(hs_compile_shared OBJECT ${hs_compile_SRCS})
set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
set_target_properties(hs_exec_shared_core2 PROPERTIES
@@ -1249,10 +1273,10 @@ endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
if (NOT FAT_RUNTIME)
add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c
${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
$<TARGET_OBJECTS:hs_compile_shared> $<TARGET_OBJECTS:hs_exec_shared>)
else()
add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c
${hs_SRCS} $<TARGET_OBJECTS:hs_exec_common_shared>
$<TARGET_OBJECTS:hs_compile_shared> $<TARGET_OBJECTS:hs_exec_common_shared>
${RUNTIME_SHLIBS})
endif()

View File

@@ -64,7 +64,7 @@ libpcre are supported. The use of unsupported constructs will result in
compilation errors.
The version of PCRE used to validate Hyperscan's interpretation of this syntax
is 8.40.
is 8.41.
====================
Supported Constructs

View File

@@ -80,6 +80,42 @@ functions for the management of streams:
another, resetting the destination stream first. This call avoids the
allocation done by :c:func:`hs_copy_stream`.
==================
Stream Compression
==================
A stream object is allocated as a fixed size region of memory which has been
sized to ensure that no memory allocations are required during scan
operations. When the system is under memory pressure, it may be useful to reduce
the memory consumed by streams that are not expected to be used soon. The
Hyperscan API provides calls for translating a stream to and from a compressed
representation for this purpose. The compressed representation differs from the
full stream object as it does not reserve space for components which are not
required given the current stream state. The Hyperscan API functions for this
functionality are:
* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
representation of the stream and returns the number of bytes consumed by the
compressed representation. If the buffer is not large enough to hold the
compressed representation, :c:member:`HS_INSUFFICIENT_SPACE` is returned along
with the required size. This call does not modify the original stream in any
way: it may still be written to with :c:func:`hs_scan_stream`, used as part of
the various reset calls to reinitialise its state, or
:c:func:`hs_close_stream` may be called to free its resources.
* :c:func:`hs_expand_stream`: creates a new stream based on a buffer containing
a compressed representation.
* :c:func:`hs_reset_and_expand_stream`: constructs a stream based on a buffer
containing a compressed representation on top of an existing stream, resetting
the existing stream first. This call avoids the allocation done by
:c:func:`hs_expand_stream`.
Note: for performance reasons, it is not recommended to compress and expand a
stream around every scan call, as converting between the compressed
representation and a standard stream takes time.
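A minimal sketch of the compress/expand round trip follows (illustrative
only: it assumes a streaming-mode database ``db``, scratch space ``scratch``,
a match callback ``on_match`` and an open stream ``stream`` already exist,
and abbreviates error handling):

.. code-block:: c

    /* Compress a stream that is expected to be idle for a while. The
     * original stream is not modified by this call. */
    char buf[4096];
    size_t used = 0;
    hs_error_t err = hs_compress_stream(stream, buf, sizeof(buf), &used);
    if (err == HS_INSUFFICIENT_SPACE) {
        /* 'used' now holds the required size: retry with a larger buffer. */
    }

    /* Once the compressed form has been stored, the stream can be closed
     * to release its memory. */
    err = hs_close_stream(stream, scratch, on_match, NULL);

    /* Later: rebuild a live stream from the compressed representation and
     * resume scanning with it. */
    hs_stream_t *expanded = NULL;
    err = hs_expand_stream(db, &expanded, buf, used);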
**********
Block Mode
**********

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -165,6 +165,7 @@ static bool higher_is_better(Criterion c) {
}
static void print_criterion(Criterion c, double val) {
std::ios::fmtflags f(cout.flags());
switch (c) {
case CRITERION_THROUGHPUT:
cout << std::fixed << std::setprecision(3) << val << " Megabits/s";
@@ -179,6 +180,7 @@ static void print_criterion(Criterion c, double val) {
cout << static_cast<size_t>(val) << " bytes";
break;
}
cout.flags(f);
}
// Key for identifying a stream in our pcap input data, using data from its IP
@@ -596,11 +598,13 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
size_t bytes = bench.bytes();
size_t matches = bench.matches();
if (diagnose) {
std::ios::fmtflags f(cout.flags());
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
<< std::fixed << std::setprecision(3)
<< (bytes * 8 * repeatCount) / (scan_time * 1000000)
<< " Mbps, Matches " << matches << endl;
cout.flags(f);
}
return (bytes * 8 * repeatCount) / (scan_time * 1000000);
}
@@ -755,10 +759,12 @@ int main(int argc, char **argv) {
for (unsigned i = count; i < 16; i++) {
cout << " ";
}
std::ios::fmtflags out_f(cout.flags());
cout << "Performance: ";
print_criterion(criterion, best);
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
<< "x) after cutting:" << endl;
cout.flags(out_f);
// s now has factor_max signatures
for (const auto &found : s) {

View File

@@ -127,6 +127,16 @@ CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
const size_t length, size_t *deserialized_size);
CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
char *buf, size_t buf_space, size_t *used_space);
CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
hs_stream_t **stream, const char *buf, size_t buf_size);
CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
const char *buf, size_t buf_size, hs_scratch_t *scratch,
match_event_handler onEvent, void *context);
/** INTERNALS **/
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);

View File

@@ -32,6 +32,7 @@
#include "fdr_internal.h"
#include "fdr_loadval.h"
#include "flood_runtime.h"
#include "scratch.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
@@ -358,7 +359,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
}
u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
last_match_id, confVal);
last_match_id, confVal, conf, bit);
} while (unlikely(!!*conf));
}
@@ -725,13 +726,17 @@ static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
assert(ISALIGNED_CL(fdr));
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u32 domain_mask_flipped = ~fdr->domainMask;
u8 stride = fdr->stride;
const u64a *ft =
(const u64a *)((const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)));
const u32 *confBase = (const u32 *)((const u8 *)ft + fdr->tabSize);
(const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
assert(ISALIGNED_CL(ft));
const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset);
assert(ISALIGNED_CL(confBase));
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
@@ -798,14 +803,14 @@ static const FDRFUNCTYPE funcs[] = {
fdr_engine_exec,
NULL, /* old: fast teddy */
NULL, /* old: fast teddy */
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat),
ONLY_AVX2(fdr_exec_fat_teddy_msks1),
ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck),
ONLY_AVX2(fdr_exec_fat_teddy_msks2),
ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck),
ONLY_AVX2(fdr_exec_fat_teddy_msks3),
ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck),
ONLY_AVX2(fdr_exec_fat_teddy_msks4),
ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck),
fdr_exec_teddy_msks1,
fdr_exec_teddy_msks1_pck,
fdr_exec_teddy_msks2,
@@ -820,8 +825,8 @@ static const FDRFUNCTYPE funcs[] = {
static const u8 fake_history[FAKE_HISTORY_SIZE];
hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
hwlm_group_t groups) {
size_t start, HWLMCallback cb,
struct hs_scratch *scratch, hwlm_group_t groups) {
// We guarantee (for safezone construction) that it is safe to read 16
// bytes before the end of the history buffer.
const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
@@ -833,7 +838,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
0,
start,
cb,
ctxt,
scratch,
nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
0
};
@@ -847,7 +852,8 @@
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
size_t start, HWLMCallback cb,
struct hs_scratch *scratch,
hwlm_group_t groups) {
struct FDR_Runtime_Args a = {
buf,
@@ -856,7 +862,7 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
hlen,
start,
cb,
ctxt,
scratch,
nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
/* we are guaranteed to always have 16 initialised bytes at the end of
* the history buffer (they may be garbage). */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -42,6 +42,7 @@ extern "C" {
#endif
struct FDR;
struct hs_scratch;
/**
* \brief Block-mode scan.
@@ -49,13 +50,13 @@ struct FDR;
* \param fdr FDR matcher engine.
* \param buf Buffer to scan.
* \param len Length of buffer to scan.
* \param start First offset in buf at which a match may end.
* \param start First offset in buf at which a match may start.
* \param cb Callback to call when a match is found.
* \param ctxt Caller-provided context pointer supplied to callback on match.
* \param scratch Scratch supplied to callback on match.
* \param groups Initial groups mask.
*/
hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
size_t start, HWLMCallback cb, struct hs_scratch *scratch,
hwlm_group_t groups);
/**
@@ -66,14 +67,15 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
* \param hlen Length of history buffer (hbuf).
* \param buf Buffer to scan.
* \param len Length of buffer to scan (buf).
* \param start First offset in buf at which a match may end.
* \param start First offset in buf at which a match may start.
* \param cb Callback to call when a match is found.
* \param ctxt Caller-provided context pointer supplied to callback on match.
* \param scratch Scratch supplied to callback on match.
* \param groups Initial groups mask.
*/
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
size_t start, HWLMCallback cb,
struct hs_scratch *scratch,
hwlm_group_t groups);
#ifdef __cplusplus

View File

@@ -42,7 +42,9 @@
#include "ue2common.h"
#include "hwlm/hwlm_build.h"
#include "util/compare.h"
#include "util/container.h"
#include "util/dump_mask.h"
#include "util/make_unique.h"
#include "util/math.h"
#include "util/noncopyable.h"
#include "util/target_info.h"
@@ -50,6 +52,7 @@
#include "util/verify_types.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cstdio>
@@ -61,6 +64,8 @@
#include <numeric>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/multi_array.hpp>
@@ -81,7 +86,6 @@ private:
bool make_small;
u8 *tabIndexToMask(u32 indexInTable);
void assignStringsToBuckets();
#ifdef DEBUG
void dumpMasks(const u8 *defaultMask);
#endif
@@ -90,10 +94,13 @@ private:
void createInitialState(FDR *fdr);
public:
FDRCompiler(vector<hwlmLiteral> lits_in, const FDREngineDescription &eng_in,
FDRCompiler(vector<hwlmLiteral> lits_in,
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits_in,
const FDREngineDescription &eng_in,
bool make_small_in, const Grey &grey_in)
: eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()),
lits(move(lits_in)), make_small(make_small_in) {}
lits(move(lits_in)), bucketToLits(move(bucketToLits_in)),
make_small(make_small_in) {}
bytecode_ptr<FDR> build();
};
@@ -144,61 +151,139 @@ void FDRCompiler::createInitialState(FDR *fdr) {
}
}
/**
* \brief Lay out FDR structures in bytecode.
*
* Note that each major structure (header, table, confirm, flood control) is
* cacheline-aligned.
*/
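// Illustrative sketch of the layout produced below (editorial aid, each
// block padded out to a 64-byte cacheline boundary):
//
//   [FDR header | pad][hash table | pad][confirm | pad][flood control]
//
// fdr->confOffset and fdr->floodOffset record the byte offsets of the
// confirm and flood control blocks from the start of the engine.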
bytecode_ptr<FDR> FDRCompiler::setupFDR() {
auto floodTable = setupFDRFloodControl(lits, eng, grey);
auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
size_t headerSize = sizeof(FDR);
size_t tabSize = eng.getTabSizeBytes();
auto floodControlTmp = setupFDRFloodControl(lits, eng, grey);
auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small);
assert(ISALIGNED_16(tabSize));
assert(ISALIGNED_16(confirmTmp.size()));
assert(ISALIGNED_16(floodControlTmp.size()));
size_t headerSize = ROUNDUP_16(sizeof(FDR));
size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.size() +
floodControlTmp.size());
// Note: we place each major structure here on a cacheline boundary.
size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(tabSize) +
ROUNDUP_CL(confirmTable.size()) + floodTable.size();
DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu "
"total=%zu\n",
headerSize, tabSize, confirmTmp.size(), floodControlTmp.size(),
headerSize, tabSize, confirmTable.size(), floodTable.size(),
size);
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
assert(fdr); // otherwise would have thrown std::bad_alloc
u8 *fdr_base = (u8 *)fdr.get();
// Write header.
fdr->size = size;
fdr->engineID = eng.getID();
fdr->maxStringLen = verify_u32(maxLen(lits));
createInitialState(fdr.get());
u8 *fdr_base = (u8 *)fdr.get();
u8 *ptr = fdr_base + ROUNDUP_16(sizeof(FDR));
copy(tab.begin(), tab.end(), ptr);
ptr += tabSize;
memcpy(ptr, confirmTmp.get(), confirmTmp.size());
ptr += confirmTmp.size();
fdr->floodOffset = verify_u32(ptr - fdr_base);
memcpy(ptr, floodControlTmp.get(), floodControlTmp.size());
ptr += floodControlTmp.size();
/* we are allowing domains 9 to 15 only */
assert(eng.bits > 8 && eng.bits < 16);
fdr->numStrings = verify_u32(lits.size());
assert(eng.bits > 8 && eng.bits < 16); // we allow domains 9 to 15 only
fdr->domain = eng.bits;
fdr->domainMask = (1 << eng.bits) - 1;
fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8);
fdr->tabSize = tabSize;
fdr->stride = eng.stride;
createInitialState(fdr.get());
// Write table.
u8 *ptr = fdr_base + ROUNDUP_CL(sizeof(FDR));
assert(ISALIGNED_CL(ptr));
copy(tab.begin(), tab.end(), ptr);
ptr += ROUNDUP_CL(tabSize);
// Write confirm structures.
assert(ISALIGNED_CL(ptr));
fdr->confOffset = verify_u32(ptr - fdr_base);
memcpy(ptr, confirmTable.get(), confirmTable.size());
ptr += ROUNDUP_CL(confirmTable.size());
// Write flood control structures.
assert(ISALIGNED_CL(ptr));
fdr->floodOffset = verify_u32(ptr - fdr_base);
memcpy(ptr, floodTable.get(), floodTable.size());
ptr += floodTable.size(); // last write, no need to round up
return fdr;
}
//#define DEBUG_ASSIGNMENT
static
double getScoreUtil(u32 len, u32 count) {
return len == 0 ? numeric_limits<double>::max()
: our_pow(count, 1.05) * our_pow(len, -3.0);
}
/**
* Utility class for computing:
*
* score(count, len) = pow(count, 1.05) * pow(len, -3)
*
* Calling pow() is expensive. This is mitigated by using pre-computed LUTs for
* small inputs and a cache for larger ones.
*/
class Scorer {
unordered_map<u32, double> count_factor_cache;
// LUT: pow(count, 1.05) for small values of count.
static const array<double, 100> count_lut;
double count_factor(u32 count) {
if (count < count_lut.size()) {
return count_lut[count];
}
auto it = count_factor_cache.find(count);
if (it != count_factor_cache.end()) {
return it->second;
}
double r = our_pow(count, 1.05);
count_factor_cache.emplace(count, r);
return r;
}
// LUT: pow(len, -3) for len in range [0,8].
static const array<double, 9> len_lut;
double len_factor(u32 len) {
assert(len <= len_lut.size());
return len_lut[len];
}
public:
double operator()(u32 len, u32 count) {
if (len == 0) {
return numeric_limits<double>::max();
}
return count_factor(count) * len_factor(len);
}
};
const array<double, 100> Scorer::count_lut{{
pow(0, 1.05), pow(1, 1.05), pow(2, 1.05), pow(3, 1.05), pow(4, 1.05),
pow(5, 1.05), pow(6, 1.05), pow(7, 1.05), pow(8, 1.05), pow(9, 1.05),
pow(10, 1.05), pow(11, 1.05), pow(12, 1.05), pow(13, 1.05), pow(14, 1.05),
pow(15, 1.05), pow(16, 1.05), pow(17, 1.05), pow(18, 1.05), pow(19, 1.05),
pow(20, 1.05), pow(21, 1.05), pow(22, 1.05), pow(23, 1.05), pow(24, 1.05),
pow(25, 1.05), pow(26, 1.05), pow(27, 1.05), pow(28, 1.05), pow(29, 1.05),
pow(30, 1.05), pow(31, 1.05), pow(32, 1.05), pow(33, 1.05), pow(34, 1.05),
pow(35, 1.05), pow(36, 1.05), pow(37, 1.05), pow(38, 1.05), pow(39, 1.05),
pow(40, 1.05), pow(41, 1.05), pow(42, 1.05), pow(43, 1.05), pow(44, 1.05),
pow(45, 1.05), pow(46, 1.05), pow(47, 1.05), pow(48, 1.05), pow(49, 1.05),
pow(50, 1.05), pow(51, 1.05), pow(52, 1.05), pow(53, 1.05), pow(54, 1.05),
pow(55, 1.05), pow(56, 1.05), pow(57, 1.05), pow(58, 1.05), pow(59, 1.05),
pow(60, 1.05), pow(61, 1.05), pow(62, 1.05), pow(63, 1.05), pow(64, 1.05),
pow(65, 1.05), pow(66, 1.05), pow(67, 1.05), pow(68, 1.05), pow(69, 1.05),
pow(70, 1.05), pow(71, 1.05), pow(72, 1.05), pow(73, 1.05), pow(74, 1.05),
pow(75, 1.05), pow(76, 1.05), pow(77, 1.05), pow(78, 1.05), pow(79, 1.05),
pow(80, 1.05), pow(81, 1.05), pow(82, 1.05), pow(83, 1.05), pow(84, 1.05),
pow(85, 1.05), pow(86, 1.05), pow(87, 1.05), pow(88, 1.05), pow(89, 1.05),
pow(90, 1.05), pow(91, 1.05), pow(92, 1.05), pow(93, 1.05), pow(94, 1.05),
pow(95, 1.05), pow(96, 1.05), pow(97, 1.05), pow(98, 1.05), pow(99, 1.05),
}};
const array<double, 9> Scorer::len_lut{{
pow(0, -3.0), pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0),
pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}};
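// Worked example for intuition: a chunk of 100 literals of length 8 scores
// pow(100, 1.05) * pow(8, -3) ~= 125.89 * (1.0 / 512) ~= 0.246, while the
// same 100 literals at length 4 would score ~= 1.97. The len^-3 term thus
// punishes short literals far harder than the count^1.05 term punishes
// crowding, and the assignment below prefers the lower score.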
/**
* Returns true if the two given literals should be placed in the same chunk as
@@ -297,7 +382,10 @@ next_literal:
return chunks;
}
void FDRCompiler::assignStringsToBuckets() {
static
map<BucketIndex, vector<LiteralIndex>> assignStringsToBuckets(
vector<hwlmLiteral> &lits,
const FDREngineDescription &eng) {
const double MAX_SCORE = numeric_limits<double>::max();
assert(!lits.empty()); // Shouldn't be called with no literals.
@@ -340,12 +428,14 @@ void FDRCompiler::assignStringsToBuckets() {
boost::multi_array<pair<double, u32>, 2> t(
boost::extents[numChunks][numBuckets]);
Scorer scorer;
for (u32 j = 0; j < numChunks; j++) {
u32 cnt = 0;
for (u32 k = j; k < numChunks; ++k) {
cnt += chunks[k].count;
}
t[j][0] = {getScoreUtil(chunks[j].length, cnt), 0};
t[j][0] = {scorer(chunks[j].length, cnt), 0};
}
for (u32 i = 1; i < numBuckets; i++) {
@@ -353,7 +443,7 @@ void FDRCompiler::assignStringsToBuckets() {
pair<double, u32> best = {MAX_SCORE, 0};
u32 cnt = chunks[j].count;
for (u32 k = j + 1; k < numChunks - 1; k++) {
auto score = getScoreUtil(chunks[j].length, cnt);
auto score = scorer(chunks[j].length, cnt);
if (score > best.first) {
break; // now worse locally than our best score, give up
}
@@ -381,6 +471,7 @@ void FDRCompiler::assignStringsToBuckets() {
// our best score is in t[0][N_BUCKETS-1] and we can follow the links
// to find where our buckets should start and what goes into them
vector<vector<LiteralIndex>> buckets;
for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) {
u32 j = t[i][n - 1].second;
if (j == 0) {
@@ -391,21 +482,33 @@ void FDRCompiler::assignStringsToBuckets() {
u32 first_id = chunks[i].first_id;
u32 last_id = chunks[j].first_id;
assert(first_id < last_id);
u32 bucket = numBuckets - n;
UNUSED const auto &first_lit = lits[first_id];
UNUSED const auto &last_lit = lits[last_id - 1];
DEBUG_PRINTF("placing [%u-%u) in bucket %u (%u lits, len %zu-%zu, "
"score %0.4f)\n",
first_id, last_id, bucket, last_id - first_id,
first_lit.s.length(), last_lit.s.length(),
getScoreUtil(first_lit.s.length(), last_id - first_id));
DEBUG_PRINTF("placing [%u-%u) in one bucket (%u lits, len %zu-%zu, "
"score %0.4f)\n",
first_id, last_id, last_id - first_id,
first_lit.s.length(), last_lit.s.length(),
scorer(first_lit.s.length(), last_id - first_id));
auto &bucket_lits = bucketToLits[bucket];
for (u32 k = first_id; k < last_id; k++) {
bucket_lits.push_back(k);
vector<LiteralIndex> litIds;
u32 cnt = last_id - first_id;
// long literals first for included literals checking
for (u32 k = 0; k < cnt; k++) {
litIds.push_back(last_id - k - 1);
}
i = j;
buckets.push_back(litIds);
}
// reverse bucket id, longer literals come first
map<BucketIndex, vector<LiteralIndex>> bucketToLits;
size_t bucketCnt = buckets.size();
for (size_t i = 0; i < bucketCnt; i++) {
bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i]));
}
return bucketToLits;
}
#ifdef DEBUG
@@ -426,7 +529,7 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
const vector<LiteralIndex> &vl,
const vector<hwlmLiteral> &lits,
SuffixPositionInString pos,
std::map<u32, ue2::unordered_set<u32> > &m2) {
map<u32, unordered_set<u32>> &m2) {
assert(eng.bits < 32);
u32 distance = 0;
@@ -497,7 +600,7 @@ void FDRCompiler::setupTab() {
SuffixPositionInString pLimit = eng.getBucketWidth(b);
for (SuffixPositionInString pos = 0; pos < pLimit; pos++) {
u32 bit = eng.getSchemeBit(b, pos);
map<u32, ue2::unordered_set<u32>> m2;
map<u32, unordered_set<u32>> m2;
bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2);
if (done) {
clearbit(&defaultMask[0], bit);
@@ -505,7 +608,7 @@ void FDRCompiler::setupTab() {
}
for (const auto &elem : m2) {
u32 dc = elem.first;
const ue2::unordered_set<u32> &mskSet = elem.second;
const unordered_set<u32> &mskSet = elem.second;
u32 v = ~dc;
do {
u32 b2 = v & dc;
@@ -529,24 +632,222 @@ void FDRCompiler::setupTab() {
}
bytecode_ptr<FDR> FDRCompiler::build() {
assignStringsToBuckets();
setupTab();
return setupFDR();
}
static
bool isSuffix(const hwlmLiteral &lit1, const hwlmLiteral &lit2) {
const auto &s1 = lit1.s;
const auto &s2 = lit2.s;
size_t len1 = s1.length();
size_t len2 = s2.length();
assert(len1 >= len2);
if (lit1.nocase || lit2.nocase) {
return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2,
[](char a, char b) { return mytoupper(a) == mytoupper(b); });
} else {
return equal(s2.begin(), s2.end(), s1.begin() + len1 - len2);
}
}
/*
* if lit2 is a suffix of lit1 but the case sensitivity, groups or mask info
* of lit2 is a subset of lit1, then lit1 can't squash lit2 and lit2 can
* possibly match when lit1 matches. In this case, we can't do bucket
* squashing. e.g. AAA(no case) in bucket 0, AA(no case) and aa in bucket 1,
* we can't squash bucket 1 if we have input like "aaa" as aa can also match.
*/
static
bool includedCheck(const hwlmLiteral &lit1, const hwlmLiteral &lit2) {
/* lit1 is caseless and lit2 is case sensitive */
if ((lit1.nocase && !lit2.nocase)) {
return true;
}
/* lit2's group is a subset of lit1 */
if (lit1.groups != lit2.groups &&
(lit2.groups == (lit1.groups & lit2.groups))) {
return true;
}
/* TODO: narrow down cases for mask check */
if (lit1.cmp != lit2.cmp || lit1.msk != lit2.msk) {
return true;
}
return false;
}
/*
* if lit2 is an included literal of both lit0 and lit1, then lit0 and lit1
* shouldn't match at the same offset, otherwise we give up squashing for lit1.
* e.g. lit0:AAA(no case), lit1:aa, lit2:A(no case). We can have duplicate
* matches for input "aaa" if lit0 and lit1 both squash lit2.
*/
static
bool checkParentLit(
const vector<hwlmLiteral> &lits, u32 pos1,
const unordered_set<u32> &parent_map,
const unordered_map<u32, unordered_set<u32>> &exception_map) {
assert(pos1 < lits.size());
const auto &lit1 = lits[pos1];
for (const auto pos2 : parent_map) {
if (contains(exception_map, pos2)) {
const auto &exception_pos = exception_map.at(pos2);
if (contains(exception_pos, pos1)) {
return false;
}
}
/* if lit1 isn't an exception of lit2, then we have to do further
* exclusive check.
* TODO: More mask checks. Note if two literals are group exclusive,
* it is possible that they match at the same offset. */
assert(pos2 < lits.size());
const auto &lit2 = lits[pos2];
if (isSuffix(lit2, lit1)) {
return false;
}
}
return true;
}
static
void buildSquashMask(vector<hwlmLiteral> &lits, u32 id1, u32 bucket1,
size_t start, const vector<pair<u32, u32>> &group,
unordered_map<u32, unordered_set<u32>> &parent_map,
unordered_map<u32, unordered_set<u32>> &exception_map) {
auto &lit1 = lits[id1];
DEBUG_PRINTF("b:%u len:%zu\n", bucket1, lit1.s.length());
size_t cnt = group.size();
bool included = false;
bool exception = false;
u32 child_id = ~0U;
for (size_t i = start; i < cnt; i++) {
u32 bucket2 = group[i].first;
assert(bucket2 >= bucket1);
u32 id2 = group[i].second;
auto &lit2 = lits[id2];
// check if lit2 is a suffix of lit1
if (isSuffix(lit1, lit2)) {
/* if we have an included literal in the same bucket,
* quit and let the included literal do possible squashing */
if (bucket1 == bucket2) {
DEBUG_PRINTF("same bucket\n");
return;
}
/* if lit2 is a suffix but doesn't pass included checks for
* extra info, we give up squashing */
if (includedCheck(lit1, lit2)) {
DEBUG_PRINTF("find exceptional suffix %u\n", lit2.id);
exception_map[id1].insert(id2);
exception = true;
} else if (checkParentLit(lits, id1, parent_map[id2],
exception_map)) {
if (lit1.included_id == INVALID_LIT_ID) {
DEBUG_PRINTF("find suffix lit1 %u lit2 %u\n",
lit1.id, lit2.id);
lit1.included_id = lit2.id;
} else {
/* if we have multiple included literals in one bucket,
* give up squashing. */
DEBUG_PRINTF("multiple included literals\n");
lit1.included_id = INVALID_LIT_ID;
return;
}
child_id = id2;
included = true;
}
}
size_t next = i + 1;
u32 nextBucket = next < cnt ? group[next].first : ~0U;
if (bucket2 != nextBucket) {
if (included) {
if (exception) {
/* give up if we have exception literals
* in the same bucket as the included literal. */
lit1.included_id = INVALID_LIT_ID;
} else {
parent_map[child_id].insert(id1);
lit1.squash |= 1U << bucket2;
DEBUG_PRINTF("build squash mask %2x for %u\n",
lit1.squash, lit1.id);
}
return;
}
exception = false;
}
}
}
static constexpr u32 INCLUDED_LIMIT = 1000;
static
void findIncludedLits(vector<hwlmLiteral> &lits,
const vector<vector<pair<u32, u32>>> &lastCharMap) {
/* Map for finding the positions of literal which includes a literal
* in FDR hwlm literal vector. */
unordered_map<u32, unordered_set<u32>> parent_map;
/* Map for finding the positions of exception literals which could
* sometimes match if a literal matches in FDR hwlm literal vector. */
unordered_map<u32, unordered_set<u32>> exception_map;
for (const auto &group : lastCharMap) {
size_t cnt = group.size();
if (cnt > INCLUDED_LIMIT) {
continue;
}
for (size_t i = 0; i < cnt; i++) {
u32 bucket1 = group[i].first;
u32 id1 = group[i].second;
buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map,
exception_map);
}
}
}
static
void addIncludedInfo(
vector<hwlmLiteral> &lits, u32 nBuckets,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
vector<vector<pair<u32, u32>>> lastCharMap(256);
for (BucketIndex b = 0; b < nBuckets; b++) {
if (!bucketToLits[b].empty()) {
for (const LiteralIndex &lit_idx : bucketToLits[b]) {
const auto &lit = lits[lit_idx];
u8 c = mytoupper(lit.s.back());
lastCharMap[c].emplace_back(b, lit_idx);
}
}
}
findIncludedLits(lits, lastCharMap);
}
} // namespace
static
bytecode_ptr<FDR> fdrBuildTableInternal(const vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey, u32 hint) {
unique_ptr<HWLMProto> fdrBuildProtoInternal(u8 engType,
vector<hwlmLiteral> &lits,
bool make_small,
const target_t &target,
const Grey &grey, u32 hint) {
DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2");
if (grey.fdrAllowTeddy) {
auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, grey);
if (fdr) {
auto proto = teddyBuildProtoHinted(engType, lits, make_small, hint,
target);
if (proto) {
DEBUG_PRINTF("build with teddy succeeded\n");
return fdr;
return proto;
} else {
DEBUG_PRINTF("build with teddy failed, will try with FDR\n");
}
@@ -564,23 +865,47 @@ bytecode_ptr<FDR> fdrBuildTableInternal(const vector<hwlmLiteral> &lits,
des->stride = 1;
}
FDRCompiler fc(lits, *des, make_small, grey);
auto bucketToLits = assignStringsToBuckets(lits, *des);
addIncludedInfo(lits, des->getNumBuckets(), bucketToLits);
auto proto =
ue2::make_unique<HWLMProto>(engType, move(des), lits, bucketToLits,
make_small);
return proto;
}
unique_ptr<HWLMProto> fdrBuildProto(u8 engType, vector<hwlmLiteral> lits,
bool make_small, const target_t &target,
const Grey &grey) {
return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
HINT_INVALID);
}
static
bytecode_ptr<FDR> fdrBuildTableInternal(const HWLMProto &proto,
const Grey &grey) {
if (proto.teddyEng) {
return teddyBuildTable(proto, grey);
}
FDRCompiler fc(proto.lits, proto.bucketToLits, *(proto.fdrEng),
proto.make_small, grey);
return fc.build();
}
bytecode_ptr<FDR> fdrBuildTable(const vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey) {
return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID);
bytecode_ptr<FDR> fdrBuildTable(const HWLMProto &proto, const Grey &grey) {
return fdrBuildTableInternal(proto, grey);
}
#if !defined(RELEASE_BUILD)
bytecode_ptr<FDR> fdrBuildTableHinted(const vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey) {
return fdrBuildTableInternal(lits, make_small, target, grey, hint);
unique_ptr<HWLMProto> fdrBuildProtoHinted(u8 engType,
vector<hwlmLiteral> lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey) {
return fdrBuildProtoInternal(engType, lits, make_small, target, grey,
hint);
}
#endif

View File

@@ -34,6 +34,7 @@
#define FDR_COMPILE_H
#include "ue2common.h"
#include "hwlm/hwlm_build.h"
#include "util/bytecode_ptr.h"
#include <vector>
@@ -46,18 +47,23 @@ struct hwlmLiteral;
struct Grey;
struct target_t;
bytecode_ptr<FDR> fdrBuildTable(const std::vector<hwlmLiteral> &lits,
bool make_small, const target_t &target,
const Grey &grey);
bytecode_ptr<FDR> fdrBuildTable(const HWLMProto &proto, const Grey &grey);
#if !defined(RELEASE_BUILD)
bytecode_ptr<FDR> fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target, const Grey &grey);
std::unique_ptr<HWLMProto> fdrBuildProtoHinted(
u8 engType,
std::vector<hwlmLiteral> lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey);
#endif
std::unique_ptr<HWLMProto> fdrBuildProto(
u8 engType,
std::vector<hwlmLiteral> lits,
bool make_small, const target_t &target,
const Grey &grey);
/** \brief Returns size in bytes of the given FDR engine. */
size_t fdrSize(const struct FDR *fdr);

View File

@@ -57,10 +57,11 @@ class FDREngineDescription;
struct hwlmStreamingControl;
struct Grey;
bytecode_ptr<u8> setupFullConfs(const std::vector<hwlmLiteral> &lits,
const EngineDescription &eng,
std::map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits,
bool make_small);
bytecode_ptr<u8> setupFullConfs(
const std::vector<hwlmLiteral> &lits,
const EngineDescription &eng,
const std::map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits,
bool make_small);
// all suffixes include an implicit max_bucket_width suffix to ensure that
// we always read a full-scale flood "behind" us in terms of what's in our

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -42,12 +42,11 @@ u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) {
#define CONF_TYPE u64a
#define CONF_HASH_CALL mul_hash_64
typedef enum LitInfoFlags {
NoFlags = 0,
Caseless = 1,
NoRepeat = 2,
ComplexConfirm = 4
} LitInfoFlags;
/**
* \brief Flag indicating this literal doesn't need to be delivered more than
* once, used in LitInfo::flags.
*/
#define FDR_LIT_FLAG_NOREPEAT 1
/**
* \brief Structure describing a literal, linked to by FDRConfirm.
@@ -61,12 +60,12 @@ struct LitInfo {
hwlm_group_t groups;
u32 id; // literal ID as passed in
u8 size;
u8 flags; /* LitInfoFlags */
u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above.
u8 next;
u8 extended_size;
};
#define FDRC_FLAG_NO_CONFIRM 1
#define FDRC_FLAG_NOREPEAT 2
/**
* \brief FDR confirm header.
@@ -79,12 +78,8 @@ struct LitInfo {
struct FDRConfirm {
CONF_TYPE andmsk;
CONF_TYPE mult;
u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all)
u32 nBits;
hwlm_group_t groups;
u32 soleLitSize;
u32 soleLitCmp;
u32 soleLitMsk;
};
static really_inline

View File

@@ -35,6 +35,7 @@
#include "util/alloc.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/container.h"
#include "util/verify_types.h"
#include <algorithm>
@@ -47,19 +48,6 @@ namespace ue2 {
using BC2CONF = map<BucketIndex, bytecode_ptr<FDRConfirm>>;
// return the number of bytes beyond a length threshold in all strings in lits
static
size_t thresholdedSize(const vector<hwlmLiteral> &lits, size_t threshold) {
size_t tot = 0;
for (const auto &lit : lits) {
size_t sz = lit.s.size();
if (sz > threshold) {
tot += ROUNDUP_N(sz - threshold, 8);
}
}
return tot;
}
static
u64a make_u64a_mask(const vector<u8> &v) {
assert(v.size() <= sizeof(u64a));
@@ -92,19 +80,12 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
LitInfo &info = tmpLitInfo[i];
memset(&info, 0, sizeof(info));
info.id = lit.id;
u8 flags = NoFlags;
if (lit.nocase) {
flags |= Caseless;
}
u8 flags = 0;
if (lit.noruns) {
flags |= NoRepeat;
}
if (lit.msk.size() > lit.s.size()) {
flags |= ComplexConfirm;
info.extended_size = verify_u8(lit.msk.size());
flags |= FDR_LIT_FLAG_NOREPEAT;
}
info.flags = flags;
info.size = verify_u8(lit.s.size());
info.size = verify_u8(max(lit.msk.size(), lit.s.size()));
info.groups = lit.groups;
// these are built up assuming a LE machine
@@ -149,7 +130,12 @@ static
static
bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
bool make_small, bool make_confirm) {
bool make_small) {
// Every literal must fit within CONF_TYPE.
assert(all_of_in(lits, [](const hwlmLiteral &lit) {
return lit.s.size() <= sizeof(CONF_TYPE);
}));
vector<LitInfo> tmpLitInfo(lits.size());
CONF_TYPE andmsk;
fillLitInfo(lits, tmpLitInfo, andmsk);
@@ -167,40 +153,6 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
}
CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
u32 flags = 0;
// we use next three variables for 'confirmless' case to speed-up
// confirmation process
u32 soleLitSize = 0;
u32 soleLitCmp = 0;
u32 soleLitMsk = 0;
if (!make_confirm) {
flags = FDRC_FLAG_NO_CONFIRM;
if (lits[0].noruns) {
flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted
}
mult = 0;
soleLitSize = lits[0].s.size() - 1;
// we can get to this point only in confirmless case;
// it means that we have only one literal per FDRConfirm (no packing),
// with no literal mask and size of literal is less or equal
// to the number of masks of Teddy engine;
// maximum number of masks for Teddy is 4, so the size of
// literal is definitely less or equal to size of u32
assert(lits[0].s.size() <= sizeof(u32));
for (u32 i = 0; i < lits[0].s.size(); i++) {
u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
u8 c = lits[0].s[lits[0].s.size() - i - 1];
if (lits[0].nocase && ourisalpha(c)) {
soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
}
else {
soleLitCmp |= (u32)c << shiftLoc;
soleLitMsk |= (u32)0xff << shiftLoc;
}
}
}
// we can walk the vector and assign elements from the vectors to a
// map by hash value
@@ -276,12 +228,11 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
#endif
const size_t bitsToLitIndexSize = (1U << nBits) * sizeof(u32);
const size_t totalLitSize = thresholdedSize(lits, sizeof(CONF_TYPE));
// this size can now be a worst-case as we can always be a bit smaller
size_t size = ROUNDUP_N(sizeof(FDRConfirm), alignof(u32)) +
ROUNDUP_N(bitsToLitIndexSize, alignof(LitInfo)) +
sizeof(LitInfo) * lits.size() + totalLitSize;
sizeof(LitInfo) * lits.size();
size = ROUNDUP_N(size, alignof(FDRConfirm));
auto fdrc = make_zeroed_bytecode_ptr<FDRConfirm>(size);
@@ -289,11 +240,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
fdrc->andmsk = andmsk;
fdrc->mult = mult;
fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
fdrc->flags = flags;
fdrc->soleLitSize = soleLitSize;
fdrc->soleLitCmp = soleLitCmp;
fdrc->soleLitMsk = soleLitMsk;
fdrc->nBits = nBits;
fdrc->groups = gm;
@@ -345,40 +292,37 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
bytecode_ptr<u8>
setupFullConfs(const vector<hwlmLiteral> &lits,
const EngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
bool make_small) {
bool makeConfirm = true;
unique_ptr<TeddyEngineDescription> teddyDescr =
getTeddyDescription(eng.getID());
if (teddyDescr) {
makeConfirm = teddyDescr->needConfirm(lits);
}
BC2CONF bc2Conf;
u32 totalConfirmSize = 0;
for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
if (!bucketToLits[b].empty()) {
if (contains(bucketToLits, b)) {
vector<hwlmLiteral> vl;
for (const LiteralIndex &lit_idx : bucketToLits[b]) {
for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
vl.push_back(lits[lit_idx]);
}
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
auto fc = getFDRConfirm(vl, make_small, makeConfirm);
auto fc = getFDRConfirm(vl, make_small);
totalConfirmSize += fc.size();
bc2Conf.emplace(b, move(fc));
}
}
u32 nBuckets = eng.getNumBuckets();
u32 totalConfSwitchSize = nBuckets * sizeof(u32);
u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize);
u32 totalConfSwitchSize = ROUNDUP_CL(nBuckets * sizeof(u32));
u32 totalSize = totalConfSwitchSize + totalConfirmSize;
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
assert(buf); // otherwise would have thrown std::bad_alloc
u32 *confBase = (u32 *)buf.get();
u8 *ptr = buf.get() + totalConfSwitchSize;
assert(ISALIGNED_CL(ptr));
for (const auto &m : bc2Conf) {
const BucketIndex &idx = m.first;

View File

@@ -29,6 +29,7 @@
#ifndef FDR_CONFIRM_RUNTIME_H
#define FDR_CONFIRM_RUNTIME_H
#include "scratch.h"
#include "fdr_internal.h"
#include "fdr_loadval.h"
#include "hwlm/hwlm.h"
@@ -41,13 +42,14 @@
static really_inline
void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a,
size_t i, hwlmcb_rv_t *control, u32 *last_match,
u64a conf_key) {
u64a conf_key, u64a *conf, u8 bit) {
assert(i < a->len);
assert(i >= a->start_offset);
assert(ISALIGNED(fdrc));
const u8 * buf = a->buf;
u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
fdrc->nBitsOrSoleID);
fdrc->nBits);
u32 start = getConfirmLitIndex(fdrc)[c];
if (likely(!start)) {
return;
@@ -56,6 +58,10 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
const struct LitInfo *li
= (const struct LitInfo *)((const u8 *)fdrc + start);
struct hs_scratch *scratch = a->scratch;
assert(!scratch->fdr_conf);
scratch->fdr_conf = conf;
scratch->fdr_conf_offset = bit;
u8 oldNext; // initialized in loop
do {
assert(ISALIGNED(li));
@@ -64,7 +70,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
goto out;
}
if ((*last_match == li->id) && (li->flags & NoRepeat)) {
if ((*last_match == li->id) && (li->flags & FDR_LIT_FLAG_NOREPEAT)) {
goto out;
}
@@ -86,99 +92,13 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
goto out;
}
if (unlikely(li->flags & ComplexConfirm)) {
const u8 *loc2 = buf + i - li->extended_size + 1;
if (loc2 < buf) {
u32 full_overhang = buf - loc2;
size_t len_history = a->len_history;
if (full_overhang > len_history) {
goto out;
}
}
}
*last_match = li->id;
*control = a->cb(loc - buf, i, li->id, a->ctxt);
*control = a->cb(i, li->id, scratch);
out:
oldNext = li->next; // oldNext is either 0 or an 'adjust' value
li++;
} while (oldNext);
}
// 'light-weight' confirmation function which is used by 1-mask Teddy;
// in the 'confirmless' case it simply calls callback function,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
void confWithBit1(const struct FDRConfirm *fdrc,
const struct FDR_Runtime_Args *a, size_t i,
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
assert(i < a->len);
assert(ISALIGNED(fdrc));
if (unlikely(fdrc->mult)) {
confWithBit(fdrc, a, i, control, last_match, conf_key);
return;
} else {
u32 id = fdrc->nBitsOrSoleID;
if ((*last_match == id) && (fdrc->flags & NoRepeat)) {
return;
}
*last_match = id;
*control = a->cb(i, i, id, a->ctxt);
}
}
// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy
// In the 'confirmless' case it makes fast 32-bit comparison,
// otherwise it calls 'confWithBit' function for the full confirmation procedure
static really_inline
void confWithBitMany(const struct FDRConfirm *fdrc,
const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
assert(i < a->len);
assert(ISALIGNED(fdrc));
if (i < a->start_offset) {
return;
}
if (unlikely(fdrc->mult)) {
confWithBit(fdrc, a, i, control, last_match, conf_key);
return;
} else {
const u32 id = fdrc->nBitsOrSoleID;
const u32 len = fdrc->soleLitSize;
if ((*last_match == id) && (fdrc->flags & NoRepeat)) {
return;
}
if (r == VECTORING && len > i - a->start_offset) {
if (len > i + a->len_history) {
return;
}
u32 cmp = (u32)a->buf[i] << 24;
if (len <= i) {
for (u32 j = 1; j <= len; j++) {
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
}
} else {
for (u32 j = 1; j <= i; j++) {
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
}
cmp |= (u32)(a->histBytes >> (40 + i * 8));
}
if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
return;
}
}
*last_match = id;
*control = a->cb(i - len, i, id, a->ctxt);
}
scratch->fdr_conf = NULL;
}
#endif

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,12 @@
#include "fdr_compile.h"
#include "fdr_compile_internal.h"
#include "fdr_confirm.h"
#include "fdr_dump.h"
#include "fdr_engine_description.h"
#include "fdr_internal.h"
#include "teddy_engine_description.h"
#include "teddy_internal.h"
#include "ue2common.h"
#include <cstdio>
@@ -43,7 +45,7 @@
#error No dump support!
#endif
using std::unique_ptr;
using namespace std;
namespace ue2 {
@@ -58,33 +60,127 @@ bool fdrIsTeddy(const FDR *fdr) {
return !getFdrDescription(engine);
}
void fdrPrintStats(const FDR *fdr, FILE *f) {
const bool isTeddy = fdrIsTeddy(fdr);
static
void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) {
const u32 *lit_index = getConfirmLitIndex(fdrc);
u32 num_lits = 1U << fdrc->nBits;
u32 lits_used = count_if(lit_index, lit_index + num_lits,
[](u32 idx) { return idx != 0; });
if (isTeddy) {
fprintf(f, "TEDDY: %u\n", fdr->engineID);
} else {
fprintf(f, "FDR: %u\n", fdr->engineID);
fprintf(f, " load %u/%u (%0.2f%%)\n", lits_used, num_lits,
(double)lits_used / (double)(num_lits)*100);
}
static
void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
FILE *f) {
const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset);
for (u32 i = 0; i < num_confirms; i++) {
const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]);
fprintf(f, " confirm %u\n", i);
fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk);
fprintf(f, " mult 0x%016llx\n", fdrc->mult);
fprintf(f, " nbits %u\n", fdrc->nBits);
fprintf(f, " groups 0x%016llx\n", fdrc->groups);
dumpLitIndex(fdrc, f);
}
}
if (isTeddy) {
auto des = getTeddyDescription(fdr->engineID);
if (des) {
fprintf(f, " masks %u\n", des->numMasks);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " packed %s\n", des->packed ? "true" : "false");
} else {
fprintf(f, " <unknown engine>\n");
static
void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) {
// dump reinforcement masks
for (u32 b = 0; b < num_tables; b++) {
fprintf(f, " reinforcement table for bucket %u..%u:\n",
b * 8, b * 8 + 7);
for (u32 i = 0; i <= N_CHARS; i++) {
fprintf(f, " 0x%02x: ", i);
for (u32 j = 0; j < 8; j++) {
u8 val = rmsk[b * ((N_CHARS + 1) * 8) + i * 8 + j];
for (u32 k = 0; k < 8; k++) {
fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0");
}
fprintf(f, " ");
}
fprintf(f, "\n");
}
} else {
fprintf(f, " domain %u\n", fdr->domain);
fprintf(f, " stride %u\n", fdr->stride);
fprintf(f, "\n");
}
}
static
void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
// dump nibble masks
fprintf(f, " nibble masks:\n");
for (u32 i = 0; i < numMasks * 2; i++) {
fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
for (u32 j = 0; j < 16 * maskWidth; j++) {
u8 val = baseMsk[i * 16 * maskWidth + j];
for (u32 k = 0; k < 8; k++) {
fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0");
}
fprintf(f, " ");
}
fprintf(f, "\n");
}
fprintf(f, "\n");
}
static
void dumpTeddy(const Teddy *teddy, FILE *f) {
fprintf(f, "TEDDY: %u\n", teddy->engineID);
auto des = getTeddyDescription(teddy->engineID);
if (!des) {
fprintf(f, " <unknown engine>\n");
return;
}
fprintf(f, " strings ???\n");
fprintf(f, " masks %u\n", des->numMasks);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " packed %s\n", des->packed ? "true" : "false");
fprintf(f, " strings %u\n", teddy->numStrings);
fprintf(f, " size %zu bytes\n", fdrSize((const FDR *)teddy));
fprintf(f, " max length %u\n", teddy->maxStringLen);
fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset,
teddy->floodOffset);
fprintf(f, "\n");
u32 maskWidth = des->getNumBuckets() / 8;
size_t headerSize = sizeof(Teddy);
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
const u8 *teddy_base = (const u8 *)teddy;
const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen);
dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
dumpTeddyReinforced(rmsk, maskWidth, f);
dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f);
}
static
void dumpFDR(const FDR *fdr, FILE *f) {
fprintf(f, "FDR: %u\n", fdr->engineID);
auto des = getFdrDescription(fdr->engineID);
if (!des) {
fprintf(f, " <unknown engine>\n");
return;
}
fprintf(f, " domain %u\n", fdr->domain);
fprintf(f, " stride %u\n", fdr->stride);
fprintf(f, " strings %u\n", fdr->numStrings);
fprintf(f, " size %zu bytes\n", fdrSize(fdr));
fprintf(f, " max length %u\n", fdr->maxStringLen);
fprintf(f, " floodoff %u (%x)\n", fdr->floodOffset, fdr->floodOffset);
fprintf(f, "\n");
dumpConfirms(fdr, fdr->confOffset, des->getNumBuckets(), f);
}
void fdrPrintStats(const FDR *fdr, FILE *f) {
if (fdrIsTeddy(fdr)) {
dumpTeddy((const Teddy *)fdr, f);
} else {
dumpFDR(fdr, f);
}
}
} // namespace ue2

View File

@@ -30,7 +30,6 @@
#define FDR_ENGINE_DESCRIPTION_H
#include "engine_description.h"
#include "util/ue2_containers.h"
#include <map>
#include <memory>

View File

@@ -36,6 +36,8 @@
#include "ue2common.h"
#include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback
struct hs_scratch;
typedef enum {
NOT_CAUTIOUS, //!< not near a boundary (quantify?)
VECTORING //!< potentially vectoring
@@ -56,7 +58,6 @@ struct FDRFlood {
u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids
hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids
u32 len[FDR_FLOOD_MAX_IDS]; //!< lengths to go with the string ids
};
/** \brief FDR structure.
@@ -69,19 +70,18 @@ struct FDR {
u32 engineID;
u32 size;
u32 maxStringLen;
u32 numStrings;
u32 confOffset;
u32 floodOffset;
u8 stride; /* stride - how frequeuntly the data is consulted by the first
u8 stride; /* stride - how frequently the data is consulted by the first
* stage matcher */
u8 domain; /* number of bits used to index into main FDR table. This value
* is used only for debugging/asserts. */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad;
m128 start; /* initial start state to use at offset 0. The state has been set
* up based on the min length of buckets to reduce the need for
* pointless confirms. */
m128 start; /* initial start state to use at offset 0. The state has been
* set up based on the min length of buckets to reduce the need
* for pointless confirms. */
};
/** \brief FDR runtime arguments.
@ -97,7 +97,7 @@ struct FDR_Runtime_Args {
size_t len_history;
size_t start_offset;
HWLMCallback cb;
void *ctxt;
struct hs_scratch *scratch;
const u8 *firstFloodDetect;
const u64a histBytes;
};

View File

@ -82,11 +82,10 @@ void addFlood(vector<FDRFlood> &tmpFlood, u8 c, const hwlmLiteral &lit,
fl.ids[fl.idCount] = lit.id;
fl.allGroups |= lit.groups;
fl.groups[fl.idCount] = lit.groups;
fl.len[fl.idCount] = suffix;
// when idCount gets to max_ids this flood no longer happens
// only incremented one more time to avoid arithmetic overflow
DEBUG_PRINTF("Added Flood for char '%c' suffix=%u len[%hu]=%u\n",
c, fl.suffix, fl.idCount, suffix);
c, fl.suffix, fl.idCount, suffix);
fl.idCount++;
}
}
@ -182,8 +181,7 @@ bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
printf("i is %02x fl->idCount is %hd fl->suffix is %d fl->allGroups is "
"%016llx\n", i, fl.idCount, fl.suffix, fl.allGroups);
for (u32 j = 0; j < fl.idCount; j++) {
printf("j is %d fl.groups[j] %016llx fl.len[j] %d \n", j,
fl.groups[j], fl.len[j]);
printf("j is %d fl.groups[j] %016llx\n", j, fl.groups[j]);
}
}
#endif

View File

@ -94,7 +94,7 @@ const u8 * floodDetect(const struct FDR * fdr,
const u8 * buf = a->buf;
const size_t len = a->len;
HWLMCallback cb = a->cb;
void * ctxt = a->ctxt;
struct hs_scratch *scratch = a->scratch;
const u8 * ptr = *ptrPtr;
// tryFloodDetect is never put in places where unconditional
@ -196,120 +196,110 @@ const u8 * floodDetect(const struct FDR * fdr,
for (u32 t = 0; t < floodSize && (*control & fl->allGroups);
t += 4) {
DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]);
u32 len0 = fl->len[0] - 1;
if (*control & fl->groups[0]) {
*control = cb(i + t + 0 - len0, i + t + 0, fl->ids[0], ctxt);
*control = cb(i + t + 0, fl->ids[0], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
*control = cb(i + t + 1, fl->ids[0], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
*control = cb(i + t + 2, fl->ids[0], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
*control = cb(i + t + 3, fl->ids[0], scratch);
}
}
break;
case 2:
for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) {
u32 len0 = fl->len[0] - 1;
u32 len1 = fl->len[1] - 1;
if (*control & fl->groups[0]) {
*control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
*control = cb(i + t, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
*control = cb(i + t, fl->ids[1], scratch);
}
if (*control & fl->groups[0]) {
*control =
cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
cb(i + t + 1, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
*control = cb(i + t + 1, fl->ids[1], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
*control = cb(i + t + 2, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t + 2 - len1, i + t + 2, fl->ids[1], ctxt);
*control = cb(i + t + 2, fl->ids[1], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
*control = cb(i + t + 3, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t + 3 - len1, i + t + 3, fl->ids[1], ctxt);
*control = cb(i + t + 3, fl->ids[1], scratch);
}
}
break;
case 3:
for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
u32 len0 = fl->len[0] - 1;
u32 len1 = fl->len[1] - 1;
u32 len2 = fl->len[2] - 1;
if (*control & fl->groups[0]) {
*control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
*control = cb(i + t, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
*control = cb(i + t, fl->ids[1], scratch);
}
if (*control & fl->groups[2]) {
*control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
*control = cb(i + t, fl->ids[2], scratch);
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
*control = cb(i + t + 1, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
*control = cb(i + t + 1, fl->ids[1], scratch);
}
if (*control & fl->groups[2]) {
*control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
*control = cb(i + t + 1, fl->ids[2], scratch);
}
}
break;
default:
// slow generalized loop
for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
u32 len0 = fl->len[0] - 1;
u32 len1 = fl->len[1] - 1;
u32 len2 = fl->len[2] - 1;
u32 len3 = fl->len[3] - 1;
if (*control & fl->groups[0]) {
*control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
*control = cb(i + t, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
*control = cb(i + t, fl->ids[1], scratch);
}
if (*control & fl->groups[2]) {
*control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
*control = cb(i + t, fl->ids[2], scratch);
}
if (*control & fl->groups[3]) {
*control = cb(i + t - len3, i + t, fl->ids[3], ctxt);
*control = cb(i + t, fl->ids[3], scratch);
}
for (u32 t2 = 4; t2 < fl->idCount; t2++) {
if (*control & fl->groups[t2]) {
*control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
*control = cb(i + t, fl->ids[t2], scratch);
}
}
if (*control & fl->groups[0]) {
*control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
*control = cb(i + t + 1, fl->ids[0], scratch);
}
if (*control & fl->groups[1]) {
*control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
*control = cb(i + t + 1, fl->ids[1], scratch);
}
if (*control & fl->groups[2]) {
*control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
*control = cb(i + t + 1, fl->ids[2], scratch);
}
if (*control & fl->groups[3]) {
*control = cb(i + t + 1 - len3, i + t + 1, fl->ids[3], ctxt);
*control = cb(i + t + 1, fl->ids[3], scratch);
}
for (u32 t2 = 4; t2 < fl->idCount; t2++) {
if (*control & fl->groups[t2]) {
*control = cb(i + t + 1 - (fl->len[t2] - 1), i + t + 1, fl->ids[t2], ctxt);
*control = cb(i + t + 1, fl->ids[t2], scratch);
}
}
}
@ -320,7 +310,7 @@ const u8 * floodDetect(const struct FDR * fdr,
for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) {
for (u32 t2 = 0; t2 < fl->idCount; t2++) {
if (*control & fl->groups[t2]) {
*control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
*control = cb(i + t, fl->ids[t2], scratch);
}
}
}

File diff suppressed because it is too large

View File

@ -73,37 +73,37 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
#if defined(HAVE_AVX2)
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control);
#endif /* HAVE_AVX2 */

File diff suppressed because it is too large

View File

@ -42,10 +42,14 @@
#include "teddy_engine_description.h"
#include "grey.h"
#include "ue2common.h"
#include "hwlm/hwlm_build.h"
#include "util/alloc.h"
#include "util/compare.h"
#include "util/container.h"
#include "util/make_unique.h"
#include "util/noncopyable.h"
#include "util/popcount.h"
#include "util/small_vector.h"
#include "util/target_info.h"
#include "util/verify_types.h"
@ -69,38 +73,58 @@ namespace {
//#define TEDDY_DEBUG
/** \brief Max number of Teddy masks we use. */
static constexpr size_t MAX_NUM_MASKS = 4;
class TeddyCompiler : noncopyable {
const TeddyEngineDescription &eng;
const Grey &grey;
const vector<hwlmLiteral> &lits;
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
bool make_small;
public:
TeddyCompiler(const vector<hwlmLiteral> &lits_in,
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits_in,
const TeddyEngineDescription &eng_in, bool make_small_in,
const Grey &grey_in)
: eng(eng_in), grey(grey_in), lits(lits_in), make_small(make_small_in) {
}
: eng(eng_in), grey(grey_in), lits(lits_in),
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
bytecode_ptr<FDR> build();
bool pack(map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits);
};
class TeddySet {
/**
* \brief Estimate of the max number of literals in a set, used to
* minimise allocations.
*/
static constexpr size_t LITS_PER_SET = 20;
/** \brief Number of masks. */
u32 len;
// nibbleSets is a series of bitfields over 16 predicates
// that represent the whether shufti nibble set
// so for num_masks = 4 we will represent our strings by
// 8 u16s in the vector that indicate what a shufti bucket
// would have to look like
vector<u16> nibbleSets;
set<u32> litIds;
/**
* \brief A series of bitfields over 16 predicates that represent the
* shufti nibble set.
*
* So for num_masks = 4 we will represent our strings by 8 u16s in the
* vector that indicate what a shufti bucket would have to look like.
*/
small_vector<u16, MAX_NUM_MASKS * 2> nibbleSets;
/**
* \brief Sorted, unique set of literals. We maintain our own set in a
* sorted vector to minimise allocations.
*/
small_vector<u32, LITS_PER_SET> litIds;
public:
explicit TeddySet(u32 len_in) : len(len_in), nibbleSets(len_in * 2, 0) {}
const set<u32> & getLits() const { return litIds; }
size_t litCount() const { return litIds.size(); }
const small_vector<u32, LITS_PER_SET> &getLits() const { return litIds; }
bool operator<(const TeddySet & s) const {
bool operator<(const TeddySet &s) const {
return litIds < s.litIds;
}
@ -116,11 +140,11 @@ public:
printf("%u ", id);
}
printf("\n");
printf("Flood prone : %s\n", isRunProne()?"yes":"no");
printf("Flood prone : %s\n", isRunProne() ? "yes" : "no");
}
#endif
bool identicalTail(const TeddySet & ts) const {
bool identicalTail(const TeddySet &ts) const {
return nibbleSets == ts.nibbleSets;
}
@ -131,24 +155,19 @@ public:
u8 c = s[s.size() - i - 1];
u8 c_hi = (c >> 4) & 0xf;
u8 c_lo = c & 0xf;
nibbleSets[i*2] = 1 << c_lo;
nibbleSets[i * 2] = 1 << c_lo;
if (lit.nocase && ourisalpha(c)) {
nibbleSets[i*2+1] = (1 << (c_hi&0xd)) | (1 << (c_hi|0x2));
nibbleSets[i * 2 + 1] =
(1 << (c_hi & 0xd)) | (1 << (c_hi | 0x2));
} else {
nibbleSets[i*2+1] = 1 << c_hi;
nibbleSets[i * 2 + 1] = 1 << c_hi;
}
} else {
nibbleSets[i*2] = nibbleSets[i*2+1] = 0xffff;
nibbleSets[i * 2] = nibbleSets[i * 2 + 1] = 0xffff;
}
}
litIds.insert(lit_id);
}
void merge(const TeddySet &ts) {
for (u32 i = 0; i < nibbleSets.size(); i++) {
nibbleSets[i] |= ts.nibbleSets[i];
}
litIds.insert(ts.litIds.begin(), ts.litIds.end());
litIds.push_back(lit_id);
sort_and_unique(litIds);
}
// return a value p from 0 .. MAXINT64 that gives p/MAXINT64
@ -167,15 +186,15 @@ public:
// a small fixed cost + the cost of traversing some sort of followup
// (assumption is that the followup is linear)
u64a heuristic() const {
return probability() * (2+litCount());
return probability() * (2 + litCount());
}
bool isRunProne() const {
u16 lo_and = 0xffff;
u16 hi_and = 0xffff;
for (u32 i = 0; i < len; i++) {
lo_and &= nibbleSets[i*2];
hi_and &= nibbleSets[i*2+1];
lo_and &= nibbleSets[i * 2];
hi_and &= nibbleSets[i * 2 + 1];
}
// we're not flood-prone if there's no way to get
// through with a flood
@ -184,10 +203,27 @@ public:
}
return true;
}
friend TeddySet merge(const TeddySet &a, const TeddySet &b) {
assert(a.nibbleSets.size() == b.nibbleSets.size());
TeddySet m(a);
for (size_t i = 0; i < m.nibbleSets.size(); i++) {
m.nibbleSets[i] |= b.nibbleSets[i];
}
m.litIds.insert(m.litIds.end(), b.litIds.begin(), b.litIds.end());
sort_and_unique(m.litIds);
return m;
}
};
bool TeddyCompiler::pack(map<BucketIndex,
std::vector<LiteralIndex> > &bucketToLits) {
static
bool pack(const vector<hwlmLiteral> &lits,
const TeddyEngineDescription &eng,
map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits) {
set<TeddySet> sts;
for (u32 i = 0; i < lits.size(); i++) {
@ -200,7 +236,8 @@ bool TeddyCompiler::pack(map<BucketIndex,
#ifdef TEDDY_DEBUG
printf("Size %zu\n", sts.size());
for (const TeddySet &ts : sts) {
printf("\n"); ts.dump();
printf("\n");
ts.dump();
}
printf("\n===============================================\n");
#endif
@ -220,9 +257,7 @@ bool TeddyCompiler::pack(map<BucketIndex,
continue;
}
TeddySet tmpSet(eng.numMasks);
tmpSet.merge(s1);
tmpSet.merge(s2);
TeddySet tmpSet = merge(s1, s2);
u64a newScore = tmpSet.heuristic();
u64a oldScore = s1.heuristic() + s2.heuristic();
if (newScore < oldScore) {
@ -250,9 +285,7 @@ bool TeddyCompiler::pack(map<BucketIndex,
}
// do the merge
TeddySet nts(eng.numMasks);
nts.merge(*m1);
nts.merge(*m2);
TeddySet nts = merge(*m1, *m2);
#ifdef TEDDY_DEBUG
printf("Merging\n");
printf("m1 = \n");
@ -282,66 +315,51 @@ bool TeddyCompiler::pack(map<BucketIndex,
return true;
}
bytecode_ptr<FDR> TeddyCompiler::build() {
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
DEBUG_PRINTF("too many literals: %zu\n", lits.size());
return nullptr;
}
// this entry has all-zero mask to skip reinforcement
#define NO_REINFORCEMENT N_CHARS
#ifdef TEDDY_DEBUG
for (size_t i = 0; i < lits.size(); i++) {
printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
lits[i].nocase ? "caseless" : "caseful");
for (size_t j = 0; j < lits[i].s.size(); j++) {
printf("%02x", ((u32)lits[i].s[j])&0xff);
}
printf("\n");
}
#endif
// this means every entry in reinforcement table
#define ALL_CHAR_SET N_CHARS
map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
if(eng.needConfirm(lits)) {
if (!pack(bucketToLits)) {
DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
lits.size(), eng.getNumBuckets());
return nullptr;
// each item's reinforcement mask has REINFORCED_MSK_LEN bytes
#define REINFORCED_MSK_LEN 8
// reinforcement table size for each set of 8 buckets
#define RTABLE_SIZE ((N_CHARS + 1) * REINFORCED_MSK_LEN)
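// e.g. with N_CHARS == 256 this is (256 + 1) * 8 == 2056 bytes per table:
// one 8-byte mask per character value plus the all-zero NO_REINFORCEMENT entry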
static
void initReinforcedTable(u8 *rmsk) {
u64a *mask = (u64a *)rmsk;
fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
}
static
void fillReinforcedMskZero(u8 *rmsk) {
u8 *mc = rmsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN;
fill_n(mc, REINFORCED_MSK_LEN, 0x00);
}
static
void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) {
assert(j > 0);
if (c == ALL_CHAR_SET) {
for (size_t i = 0; i < N_CHARS; i++) {
u8 *mc = rmsk + i * REINFORCED_MSK_LEN;
mc[j - 1] &= ~bmsk;
}
} else {
for (u32 i = 0; i < lits.size(); i++) {
bucketToLits[i].push_back(i);
}
u8 *mc = rmsk + c * REINFORCED_MSK_LEN;
mc[j - 1] &= ~bmsk;
}
u32 maskWidth = eng.getNumBuckets() / 8;
}
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
auto floodControlTmp = setupFDRFloodControl(lits, eng, grey);
auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small);
size_t size = ROUNDUP_N(sizeof(Teddy) +
maskLen +
confirmTmp.size() +
floodControlTmp.size(),
16 * maskWidth);
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
assert(fdr); // otherwise would have thrown std::bad_alloc
Teddy *teddy = (Teddy *)fdr.get(); // ugly
u8 *teddy_base = (u8 *)teddy;
teddy->size = size;
teddy->engineID = eng.getID();
teddy->maxStringLen = verify_u32(maxLen(lits));
u8 *ptr = teddy_base + sizeof(Teddy) + maskLen;
memcpy(ptr, confirmTmp.get(), confirmTmp.size());
ptr += confirmTmp.size();
teddy->floodOffset = verify_u32(ptr - teddy_base);
memcpy(ptr, floodControlTmp.get(), floodControlTmp.size());
ptr += floodControlTmp.size();
u8 *baseMsk = teddy_base + sizeof(Teddy);
static
void fillNibbleMasks(const map<BucketIndex,
vector<LiteralIndex>> &bucketToLits,
const vector<hwlmLiteral> &lits,
u32 numMasks, u32 maskWidth, size_t maskLen,
u8 *baseMsk) {
memset(baseMsk, 0xff, maskLen);
for (const auto &b2l : bucketToLits) {
const u32 &bucket_id = b2l.first;
@ -354,16 +372,18 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
const u32 sz = verify_u32(l.s.size());
// fill in masks
for (u32 j = 0; j < eng.numMasks; j++) {
u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
for (u32 j = 0; j < numMasks; j++) {
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
const u32 lo_base = msk_id_lo * 16;
const u32 hi_base = msk_id_hi * 16;
// if we don't have a char at this position, fill in i
// locations in these masks with '1'
if (j >= sz) {
for (u32 n = 0; n < 16; n++) {
baseMsk[msk_id_lo * 16 + n] |= bmsk;
baseMsk[msk_id_hi * 16 + n] |= bmsk;
baseMsk[lo_base + n] &= ~bmsk;
baseMsk[hi_base + n] &= ~bmsk;
}
} else {
u8 c = l.s[sz - 1 - j];
@ -382,51 +402,173 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
for (u8 cm = 0; cm < 0x10; cm++) {
if ((cm & m_lo) == (cmp_lo & m_lo)) {
baseMsk[msk_id_lo * 16 + cm] |= bmsk;
baseMsk[lo_base + cm] &= ~bmsk;
}
if ((cm & m_hi) == (cmp_hi & m_hi)) {
baseMsk[msk_id_hi * 16 + cm] |= bmsk;
baseMsk[hi_base + cm] &= ~bmsk;
}
}
} else{
} else {
if (l.nocase && ourisalpha(c)) {
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
baseMsk[msk_id_hi * 16 + (n_hi & cmHalfClear)] |= bmsk;
baseMsk[msk_id_hi * 16 + (n_hi | cmHalfSet )] |= bmsk;
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk;
baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk;
} else {
baseMsk[msk_id_hi * 16 + n_hi] |= bmsk;
baseMsk[hi_base + n_hi] &= ~bmsk;
}
baseMsk[msk_id_lo * 16 + n_lo] |= bmsk;
baseMsk[lo_base + n_lo] &= ~bmsk;
}
}
}
}
}
}
static
void fillReinforcedTable(const map<BucketIndex,
vector<LiteralIndex>> &bucketToLits,
const vector<hwlmLiteral> &lits,
u8 *rtable_base, const u32 num_tables) {
vector<u8 *> tables;
for (u32 i = 0; i < num_tables; i++) {
tables.push_back(rtable_base + i * RTABLE_SIZE);
}
for (auto t : tables) {
initReinforcedTable(t);
}
for (const auto &b2l : bucketToLits) {
const u32 &bucket_id = b2l.first;
const vector<LiteralIndex> &ids = b2l.second;
u8 *rmsk = tables[bucket_id / 8];
const u8 bmsk = 1U << (bucket_id % 8);
for (const LiteralIndex &lit_id : ids) {
const hwlmLiteral &l = lits[lit_id];
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
const u32 sz = verify_u32(l.s.size());
// fill in reinforced masks
for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) {
if (sz - 1 < j) {
fillReinforcedMsk(rmsk, ALL_CHAR_SET, j, bmsk);
} else {
u8 c = l.s[sz - 1 - j];
if (l.nocase && ourisalpha(c)) {
u8 c_up = c & 0xdf;
fillReinforcedMsk(rmsk, c_up, j, bmsk);
u8 c_lo = c | 0x20;
fillReinforcedMsk(rmsk, c_lo, j, bmsk);
} else {
fillReinforcedMsk(rmsk, c, j, bmsk);
}
}
}
}
}
for (auto t : tables) {
fillReinforcedMskZero(t);
}
}
bytecode_ptr<FDR> TeddyCompiler::build() {
u32 maskWidth = eng.getNumBuckets() / 8;
size_t headerSize = sizeof(Teddy);
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth;
auto floodTable = setupFDRFloodControl(lits, eng, grey);
auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
// Note: we place each major structure here on a cacheline boundary.
size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
ROUNDUP_CL(reinforcedMaskLen) +
ROUNDUP_CL(confirmTable.size()) + floodTable.size();
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
assert(fdr); // otherwise would have thrown std::bad_alloc
Teddy *teddy = (Teddy *)fdr.get(); // ugly
u8 *teddy_base = (u8 *)teddy;
// Write header.
teddy->size = size;
teddy->engineID = eng.getID();
teddy->maxStringLen = verify_u32(maxLen(lits));
teddy->numStrings = verify_u32(lits.size());
// Write confirm structures.
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
ROUNDUP_CL(reinforcedMaskLen);
assert(ISALIGNED_CL(ptr));
teddy->confOffset = verify_u32(ptr - teddy_base);
memcpy(ptr, confirmTable.get(), confirmTable.size());
ptr += ROUNDUP_CL(confirmTable.size());
// Write flood control structures.
assert(ISALIGNED_CL(ptr));
teddy->floodOffset = verify_u32(ptr - teddy_base);
memcpy(ptr, floodTable.get(), floodTable.size());
ptr += floodTable.size();
// Write teddy masks.
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
baseMsk);
// Write reinforcement masks.
u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth);
return fdr;
}
static
bool assignStringsToBuckets(
const vector<hwlmLiteral> &lits,
TeddyEngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
assert(eng.numMasks <= MAX_NUM_MASKS);
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
DEBUG_PRINTF("too many literals: %zu\n", lits.size());
return false;
}
#ifdef TEDDY_DEBUG
for (u32 i = 0; i < eng.numMasks * 2; i++) {
for (u32 j = 0; j < 16; j++) {
u8 val = baseMsk[i * 16 + j];
for (u32 k = 0; k < 8; k++) {
printf("%s", ((val >> k) & 0x1) ? "1" : "0");
}
printf(" ");
for (size_t i = 0; i < lits.size(); i++) {
printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
lits[i].nocase ? "caseless" : "caseful");
for (size_t j = 0; j < lits[i].s.size(); j++) {
printf("%02x", ((u32)lits[i].s[j])&0xff);
}
printf("\n");
}
#endif
return fdr;
if (!pack(lits, eng, bucketToLits)) {
DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
lits.size(), eng.getNumBuckets());
return false;
}
return true;
}
} // namespace
bytecode_ptr<FDR> teddyBuildTableHinted(const vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey) {
bytecode_ptr<FDR> teddyBuildTable(const HWLMProto &proto, const Grey &grey) {
TeddyCompiler tc(proto.lits, proto.bucketToLits, *(proto.teddyEng),
proto.make_small, grey);
return tc.build();
}
unique_ptr<HWLMProto> teddyBuildProtoHinted(
u8 engType, const vector<hwlmLiteral> &lits,
bool make_small, u32 hint, const target_t &target) {
unique_ptr<TeddyEngineDescription> des;
if (hint == HINT_INVALID) {
des = chooseTeddyEngine(target, lits);
@ -436,8 +578,14 @@ bytecode_ptr<FDR> teddyBuildTableHinted(const vector<hwlmLiteral> &lits,
if (!des) {
return nullptr;
}
TeddyCompiler tc(lits, *des, make_small, grey);
return tc.build();
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
if (!assignStringsToBuckets(lits, *des, bucketToLits)) {
return nullptr;
}
return ue2::make_unique<HWLMProto>(engType, move(des), lits,
bucketToLits, make_small);
}
} // namespace ue2

View File

@ -35,6 +35,7 @@
#define TEDDY_COMPILE_H
#include "ue2common.h"
#include "hwlm/hwlm_build.h"
#include "util/bytecode_ptr.h"
#include <vector>
@ -43,15 +44,16 @@ struct FDR;
namespace ue2 {
class TeddyEngineDescription;
struct Grey;
struct hwlmLiteral;
struct target_t;
bytecode_ptr<FDR> teddyBuildTableHinted(const std::vector<hwlmLiteral> &lits,
bool make_small, u32 hint,
const target_t &target,
const Grey &grey);
bytecode_ptr<FDR> teddyBuildTable(const HWLMProto &proto, const Grey &grey);
std::unique_ptr<HWLMProto> teddyBuildProtoHinted(
u8 engType, const std::vector<hwlmLiteral> &lits,
bool make_small, u32 hint, const target_t &target);
} // namespace ue2
#endif // TEDDY_COMPILE_H

View File

@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
return numMasks;
}
bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
if (packed || lits.size() > getNumBuckets()) {
return true;
}
for (const auto &lit : lits) {
if (lit.s.size() > numMasks || !lit.msk.empty()) {
return true;
}
}
return false;
}
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },

View File

@ -55,7 +55,6 @@ public:
explicit TeddyEngineDescription(const TeddyEngineDef &def);
u32 getDefaultFloodSuffixLength() const override;
bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
};
std::unique_ptr<TeddyEngineDescription>

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -26,6 +26,28 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/* Teddy bytecode layout:
* * |-----|
* * | | struct Teddy
* * |-----|
* * | | teddy masks
* * | |
* * |-----|
* * | | reinforcement mask table for bucket 0..7
* * | |
* * |-----|
* * | | reinforcement mask table for bucket 8..15 (FAT teddy)
* * | |
* * |-----|
* * | | confirm
* * | |
* * | |
* * |-----|
* * | | flood control
* * | |
* * |-----|
*/
#ifndef TEDDY_INTERNAL_H
#define TEDDY_INTERNAL_H
@ -36,11 +58,9 @@ struct Teddy {
u32 engineID;
u32 size;
u32 maxStringLen;
u32 numStrings;
u32 confOffset;
u32 floodOffset;
u32 link;
u32 pad1;
u32 pad2;
u32 pad3;
};
#endif
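As a rough sketch of how the layout above is walked at runtime (the real
accessors live in teddy_runtime_common.h; this assumes the ROUNDUP_CL
cacheline-rounding macro and the m128 type from the rest of the codebase, and
the helper names are hypothetical):
/* Hypothetical helpers mirroring the layout comment above. */
static const u8 *teddy_nibble_masks(const struct Teddy *t) {
    return (const u8 *)t + ROUNDUP_CL(sizeof(struct Teddy));
}
static const u8 *teddy_reinforced_tables(const struct Teddy *t, u32 numMasks) {
    /* nibble masks occupy numMasks * 2 m128s per 8-bucket group */
    return teddy_nibble_masks(t) + ROUNDUP_CL(2 * numMasks * sizeof(m128));
}
static const u8 *teddy_confirm(const struct Teddy *t) {
    return (const u8 *)t + t->confOffset; /* offset stored in the header */
}
static const u8 *teddy_flood_control(const struct Teddy *t) {
    return (const u8 *)t + t->floodOffset;
}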

View File

@ -38,8 +38,12 @@
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(HAVE_AVX2)
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
#endif
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
}
// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
// then 0xff for the rest of the bytes.
// ptr >= lo:
// no history.
// for end/short zone, ptr==lo and start_offset==0
// for start zone, see below
// lo ptr hi hi
// |----------|-------|----------------|............|
// -start 0 -start+offset MIN(avail,16)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
// only start zone.
// history
// ptr lo hi hi
// |----------|-------|----------------|............|
// 0 start start+offset end(<=16)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
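// Worked example: with ptr == lo, start_offset == 0 and avail == 10, the
// load below is loadu128(p_mask_arr[10] + 16): no leading poison, ten
// valid (0x00) bytes, then six trailing poison (0xff) bytes.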
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
union {
@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) {
if (ptr >= lo) { // short/end/start zone
uintptr_t start = (uintptr_t)(ptr - lo);
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 16) {
*p_mask = load128(p_mask_arr[16] + 16);
assert(start_offset - start <= 16);
*p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+ 16 - start_offset + start);
return loadu128(ptr);
}
*p_mask = load128(p_mask_arr[avail] + 16);
assert(start_offset - start <= avail);
*p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+ 16 - start_offset + start);
copy_start = 0;
copy_len = avail;
} else {
} else { // start zone
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(len_history, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start - need; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
for (i = start - need; i < start; i++) {
u.val8[i] = buf_history[len_history - (start - i)];
}
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
copy_start = i;
copy_len = end - i;
assert(start + start_offset <= end);
*p_mask = loadu128(p_mask_arr[end - start - start_offset]
+ 16 - start - start_offset);
copy_start = start;
copy_len = end - start;
}
// Runt block from the buffer.
@ -152,6 +182,205 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
return u.val128;
}
#if defined(HAVE_AVX2)
/*
* \brief Copy a block of [0,31] bytes efficiently.
*
* This function is a workaround intended to stop some compilers from
* synthesizing a memcpy function call out of the copy of a small number of
* bytes that we do in vectoredLoad256.
*/
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
switch (len) {
case 0:
break;
case 1:
*dst = *src;
break;
case 2:
unaligned_store_u16(dst, unaligned_load_u16(src));
break;
case 3:
unaligned_store_u16(dst, unaligned_load_u16(src));
dst[2] = src[2];
break;
case 4:
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 5:
case 6:
case 7:
/* Perform copy with two overlapping 4-byte chunks. */
unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
unaligned_store_u32(dst, unaligned_load_u32(src));
break;
case 8:
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
/* Perform copy with two overlapping 8-byte chunks. */
unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
unaligned_store_u64a(dst, unaligned_load_u64a(src));
break;
case 16:
storeu128(dst, loadu128(src));
break;
default:
/* Perform copy with two overlapping 16-byte chunks. */
assert(len < 32);
storeu128(dst + len - 16, loadu128(src + len - 16));
storeu128(dst, loadu128(src));
break;
}
}
// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
// then 0xff for the rest of the bytes.
// ptr >= lo:
// no history.
// for end/short zone, ptr==lo and start_offset==0
// for start zone, see below
// lo ptr hi hi
// |----------|-------|----------------|............|
// -start 0 -start+offset MIN(avail,32)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
// only start zone.
// history
// ptr lo hi hi
// |----------|-------|----------------|............|
// 0 start start+offset end(<=32)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
union {
u8 val8[32];
m256 val256;
} u;
u.val256 = zeroes256();
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) { // short/end/start zone
uintptr_t start = (uintptr_t)(ptr - lo);
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 32) {
assert(start_offset - start <= 32);
*p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+ 32 - start_offset + start);
return loadu256(ptr);
}
assert(start_offset - start <= avail);
*p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+ 32 - start_offset + start);
copy_start = 0;
copy_len = avail;
} else { //start zone
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(len_history, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start - need; i < start; i++) {
u.val8[i] = buf_history[len_history - (start - i)];
}
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
assert(start + start_offset <= end);
*p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+ 32 - start - start_offset);
copy_start = start;
copy_len = end - start;
}
// Runt block from the buffer.
copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
return u.val256;
}
#endif // HAVE_AVX2
#if defined(HAVE_AVX512)
// Note: p_mask is an output param that initialises a poison mask.
// u64a k = ones_u64a << n' >> m'; // m' < n'
// *p_mask = set_mask_m512(~k);
// means p_mask consists of:
// (n' - m') poison bytes "0xff" at the beginning,
// followed by (64 - n') valid bytes "0x00",
// then the remaining m' poison bytes "0xff".
// ptr >= lo:
// no history.
// for end/short zone, ptr==lo and start_offset==0
// for start zone, see below
// lo ptr hi hi
// |----------|-------|----------------|............|
// -start 0 -start+offset MIN(avail,64)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
// only start zone.
// history
// ptr lo hi hi
// |----------|-------|----------------|............|
// 0 start start+offset end(<=64)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
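// Worked example: in the short zone with start == start_offset == 0 and
// avail == 10, k = ones_u64a << 54 >> 54 sets the low ten bits, so
// set_mask_m512(~k) leaves bytes 0..9 valid and poisons bytes 10..63.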
static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
const u32 nMasks) {
m512 val;
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) { // short/end/start zone
uintptr_t start = (uintptr_t)(ptr - lo);
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 64) {
assert(start_offset - start <= 64);
u64a k = ones_u64a << (start_offset - start);
*p_mask = set_mask_m512(~k);
return loadu512(ptr);
}
assert(start_offset - start <= avail);
u64a k = ones_u64a << (64 - avail + start_offset - start)
>> (64 - avail);
*p_mask = set_mask_m512(~k);
copy_start = 0;
copy_len = avail;
} else { //start zone
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(hlen, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
val = loadu_maskz_m512(j, &hbuf[hlen - start]);
uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
assert(start + start_offset <= end);
u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
*p_mask = set_mask_m512(~k);
copy_start = start;
copy_len = end - start;
}
assert(copy_len < 64);
assert(copy_len > 0);
u64a j = ones_u64a >> (64 - copy_len) << copy_start;
val = loadu_mask_m512(val, j, ptr);
return val;
}
#endif // HAVE_AVX512
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
CautionReason reason) {
@ -190,63 +419,27 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
if (!(fdrc->groups & *control)) {
continue;
}
u64a tmp = 0;
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit(fdrc, a, ptr - a->buf + byte, control,
last_match, confVal);
last_match, confVal, &tmp, 0);
} while (unlikely(*conf));
}
static really_inline
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
confVal);
} while (unlikely(*conf));
const m128 *getMaskBase(const struct Teddy *teddy) {
return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
static really_inline
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
const u32 *confBase, CautionReason reason,
const struct FDR_Runtime_Args *a, const u8 *ptr,
hwlmcb_rv_t *control, u32 *last_match) {
do {
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
u32 byte = bit / bucket + offset;
u32 idx = bit % bucket;
u32 cf = confBase[idx];
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
((const u8 *)confBase + cf);
if (!(fdrc->groups & *control)) {
continue;
}
u64a confVal = getConfVal(a, ptr, byte, reason);
confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
last_match, confVal);
} while (unlikely(*conf));
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
return (const u64a *)((const u8 *)getMaskBase(teddy)
+ ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
static really_inline
const m128 * getMaskBase(const struct Teddy *teddy) {
return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
}
static really_inline
const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
(numMask*32));
const u32 *getConfBase(const struct Teddy *teddy) {
return (const u32 *)((const u8 *)teddy + teddy->confOffset);
}
#endif /* TEDDY_RUNTIME_COMMON_H_ */

View File

@ -139,6 +139,7 @@ Grey::Grey(void) :
limitSmallWriteOutfixSize(1048576), // 1 MB
smallWriteMaxPatterns(10000),
smallWriteMaxLiterals(10000),
smallWriteMergeBatchSize(20),
allowTamarama(true), // Tamarama engine
tamaChunkSize(100),
dumpFlags(0),
@ -302,6 +303,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(limitSmallWriteOutfixSize);
G_UPDATE(smallWriteMaxPatterns);
G_UPDATE(smallWriteMaxLiterals);
G_UPDATE(smallWriteMergeBatchSize);
G_UPDATE(allowTamarama);
G_UPDATE(tamaChunkSize);
G_UPDATE(limitPatternCount);

View File

@ -157,6 +157,7 @@ struct Grey {
u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs
u32 smallWriteMaxPatterns; // only try small writes if fewer patterns
u32 smallWriteMaxLiterals; // only try small writes if fewer literals
u32 smallWriteMergeBatchSize; // number of DFAs to merge in a batch
// Tamarama engine
bool allowTamarama;

View File

@ -227,10 +227,10 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
target_t target_info = platform ? target_t(*platform)
: get_current_target();
CompileContext cc(isStreaming, isVectored, target_info, g);
NG ng(cc, elements, somPrecision);
try {
CompileContext cc(isStreaming, isVectored, target_info, g);
NG ng(cc, elements, somPrecision);
for (unsigned int i = 0; i < elements; i++) {
// Add this expression to the compiler
try {
@ -262,7 +262,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
e.hasIndex ? (int)e.index : -1);
return HS_COMPILER_ERROR;
}
catch (std::bad_alloc) {
catch (const std::bad_alloc &) {
*db = nullptr;
*comp_error = const_cast<hs_compile_error_t *>(&hs_enomem);
return HS_COMPILER_ERROR;
@ -399,7 +399,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
*error = generateCompileError(e);
return HS_COMPILER_ERROR;
}
catch (std::bad_alloc) {
catch (std::bad_alloc &) {
*error = const_cast<hs_compile_error_t *>(&hs_enomem);
return HS_COMPILER_ERROR;
}

View File

@ -561,6 +561,18 @@ hs_error_t HS_CDECL hs_valid_platform(void);
*/
#define HS_ARCH_ERROR (-11)
/**
* Provided buffer was too small.
*
* This error indicates that there was insufficient space in the buffer. The
* call should be repeated with a larger provided buffer.
*
* Note: in this situation, it is normal for the amount of space required to be
* returned in the same manner as the used space would have been returned if the
* call was successful.
*/
#define HS_INSUFFICIENT_SPACE (-12)
/** @} */
#ifdef __cplusplus

View File

@ -321,6 +321,120 @@ hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id,
match_event_handler onEvent,
void *context);
/**
* Creates a compressed representation of the provided stream in the buffer
* provided. This compressed representation can be converted back into a stream
* state by using @ref hs_expand_stream() or @ref hs_reset_and_expand_stream().
* The size of the compressed representation will be placed into @a used_space.
*
* If there is not sufficient space in the buffer to hold the compressed
* representation, @ref HS_INSUFFICIENT_SPACE will be returned and @a used_space
* will be populated with the amount of space required.
*
* Note: this function does not close the provided stream, you may continue to
* use the stream or to free it with @ref hs_close_stream().
*
* @param stream
* The stream (as created by @ref hs_open_stream()) to be compressed.
*
* @param buf
* Buffer to write the compressed representation into. Note: if the call is
* just being used to determine the amount of space required, it is allowed
* to pass NULL here and @a buf_space as 0.
*
* @param buf_space
* The number of bytes in @a buf. If buf_space is too small, the call will
* fail with @ref HS_INSUFFICIENT_SPACE.
*
* @param used_space
* Pointer to where the amount of used space will be written to. The used
* buffer space is always less than or equal to @a buf_space. If the call
* fails with @ref HS_INSUFFICIENT_SPACE, this pointer will be used to
* write out the amount of buffer space required.
*
* @return
* @ref HS_SUCCESS on success, @ref HS_INSUFFICIENT_SPACE if the provided
* buffer is too small.
*/
hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf,
size_t buf_space, size_t *used_space);
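For illustration, a caller can size the buffer with a probe call and retry on
@ref HS_INSUFFICIENT_SPACE; a minimal sketch (assuming a stream opened with
hs_open_stream() and ordinary malloc(); error handling elided):
    size_t used = 0;
    hs_error_t err = hs_compress_stream(stream, NULL, 0, &used);
    if (err == HS_INSUFFICIENT_SPACE) {
        char *buf = malloc(used); /* 'used' now holds the required size */
        err = hs_compress_stream(stream, buf, used, &used);
    }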
/**
* Decompresses a compressed representation created by @ref hs_compress_stream()
* into a new stream.
*
* Note: @a buf must correspond to a complete compressed representation created
* by @ref hs_compress_stream() of a stream that was opened against @a db. It is
* not always possible to detect misuse of this API and behaviour is undefined
* if these properties are not satisfied.
*
* @param db
* The compiled pattern database that the compressed stream was opened
* against.
*
* @param stream
* On success, a pointer to the expanded @ref hs_stream_t will be
* returned; NULL on failure.
*
* @param buf
* A compressed representation of a stream. These compressed forms are
* created by @ref hs_compress_stream().
*
* @param buf_size
* The size in bytes of the compressed representation.
*
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db,
hs_stream_t **stream, const char *buf,
size_t buf_size);
/**
* Decompresses a compressed representation created by @ref hs_compress_stream()
* on top of the 'to' stream. The 'to' stream will first be reset (reporting
* any EOD matches if a non-NULL @a onEvent callback handler is provided).
*
* Note: the 'to' stream must be opened against the same database as the
* compressed stream.
*
* Note: @a buf must correspond to a complete compressed representation created
* by @ref hs_compress_stream() of a stream that was opened against @a db. It is
* not always possible to detect misuse of this API and behaviour is undefined
* if these properties are not satisfied.
*
* @param to_stream
* A pointer to a valid stream (as created by @ref hs_open_stream()) on
* top of which the compressed representation will be expanded; the
* stream is reset first.
*
* @param buf
* A compressed representation of a stream. These compressed forms are
* created by @ref hs_compress_stream().
*
* @param buf_size
* The size in bytes of the compressed representation.
*
* @param scratch
* A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is
* allowed to be NULL only if the @a onEvent callback is also NULL.
*
* @param onEvent
* Pointer to a match event callback function. If a NULL pointer is given,
* no matches will be returned.
*
* @param context
* The user defined pointer which will be passed to the callback function
* when a match occurs.
*
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream,
const char *buf, size_t buf_size,
hs_scratch_t *scratch,
match_event_handler onEvent,
void *context);
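A corresponding decompression sketch (assuming 'buf'/'used' from the
hs_compress_stream() example above; 'db', 'to', 'scratch', 'onEvent' and
'ctx' are hypothetical names for the caller's database, stream, scratch and
callback):
    hs_stream_t *expanded = NULL;
    hs_error_t err = hs_expand_stream(db, &expanded, buf, used);
    /* or, reusing an already-open stream 'to' (it is reset first): */
    err = hs_reset_and_expand_stream(to, buf, used, scratch, onEvent, ctx);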
/**
* The block (non-streaming) regular expression scanner.
*

View File

@ -170,7 +170,7 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
}
hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len,
size_t start, HWLMCallback cb, void *ctxt,
size_t start, HWLMCallback cb, struct hs_scratch *scratch,
hwlm_group_t groups) {
assert(t);
@ -184,25 +184,23 @@ hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len,
if (t->type == HWLM_ENGINE_NOOD) {
DEBUG_PRINTF("calling noodExec\n");
return noodExec(HWLM_C_DATA(t), buf + start, len - start, start, cb,
ctxt);
} else {
assert(t->type == HWLM_ENGINE_FDR);
const union AccelAux *aa = &t->accel0;
if ((groups & ~t->accel1_groups) == 0) {
DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
aa = &t->accel1;
}
do_accel_block(aa, buf, len, &start);
DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups,
start);
return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, ctxt, groups);
return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch);
}
assert(t->type == HWLM_ENGINE_FDR);
const union AccelAux *aa = &t->accel0;
if ((groups & ~t->accel1_groups) == 0) {
DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
aa = &t->accel1;
}
do_accel_block(aa, buf, len, &start);
DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start);
return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, scratch, groups);
}
hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch,
size_t len, size_t start, HWLMCallback cb,
void *ctxt, hwlm_group_t groups) {
hwlm_error_t hwlmExecStreaming(const struct HWLM *t, size_t len, size_t start,
HWLMCallback cb, struct hs_scratch *scratch,
hwlm_group_t groups) {
assert(t);
assert(scratch);
@ -224,24 +222,21 @@ hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch,
// If we've been handed a start offset, we can use a block mode scan at
// that offset.
if (start) {
return noodExec(HWLM_C_DATA(t), buf + start, len - start, start,
cb, ctxt);
return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch);
} else {
return noodExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, cb,
ctxt, scratch->fdr_temp_buf,
FDR_TEMP_BUF_SIZE);
scratch);
}
} else {
// t->type == HWLM_ENGINE_FDR
const union AccelAux *aa = &t->accel0;
if ((groups & ~t->accel1_groups) == 0) {
DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
aa = &t->accel1;
}
do_accel_streaming(aa, hbuf, hlen, buf, len, &start);
DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups,
start);
return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len,
start, cb, ctxt, groups);
}
assert(t->type == HWLM_ENGINE_FDR);
const union AccelAux *aa = &t->accel0;
if ((groups & ~t->accel1_groups) == 0) {
DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
aa = &t->accel1;
}
do_accel_streaming(aa, hbuf, hlen, buf, len, &start);
DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start);
return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, start, cb,
scratch, groups);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -71,14 +71,17 @@ typedef hwlm_group_t hwlmcb_rv_t;
* designed for a different architecture). */
#define HWLM_ERROR_UNKNOWN 2
/** \brief Max length of the literal passed to HWLM. */
#define HWLM_LITERAL_MAX_LEN 8
struct hs_scratch;
struct HWLM;
/** \brief The type for an HWLM callback.
*
* This callback receives a start-of-match offset, an end-of-match offset, the
* ID of the match and the context pointer that was passed into \ref
* hwlmExec or \ref hwlmExecStreaming.
* This callback receives an end-of-match offset, the ID of the match and
* the context pointer that was passed into \ref hwlmExec or
* \ref hwlmExecStreaming.
*
* A callback return of \ref HWLM_TERMINATE_MATCHING will stop matching.
*
@ -92,8 +95,8 @@ struct HWLM;
* belonging to the literal which was active when the end match location
* was first reached.
*/
typedef hwlmcb_rv_t (*HWLMCallback)(size_t start, size_t end, u32 id,
void *context);
typedef hwlmcb_rv_t (*HWLMCallback)(size_t end, u32 id,
struct hs_scratch *scratch);
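For example, a minimal callback under this signature might look like the
following sketch (the match handling is hypothetical; HWLM_CONTINUE_MATCHING
and HWLM_TERMINATE_MATCHING are the return conventions from this header):
    static hwlmcb_rv_t onHwlmMatch(size_t end, u32 id,
                                   struct hs_scratch *scratch) {
        /* record the match for literal 'id' ending at offset 'end' */
        return HWLM_CONTINUE_MATCHING; /* or HWLM_TERMINATE_MATCHING */
    }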
/** \brief Match strings in table.
*
@ -104,24 +107,26 @@ typedef hwlmcb_rv_t (*HWLMCallback)(size_t start, size_t end, u32 id,
* Returns \ref HWLM_TERMINATED if scanning is cancelled due to the callback
* returning \ref HWLM_TERMINATE_MATCHING.
*
* \p start is the first offset at which a match may start.
* \p start is the first offset at which a match may start. Note: match
* starts may include masks overhanging the main literal.
*
* The underlying engine may choose not to report any match which starts before
* the first possible match of a literal which is in the initial group mask.
*/
hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len,
size_t start, HWLMCallback callback, void *context,
hwlm_group_t groups);
size_t start, HWLMCallback callback,
struct hs_scratch *scratch, hwlm_group_t groups);
/** \brief As for \ref hwlmExec, but a streaming case across two buffers.
*
* \p scratch is used to access fdr_temp_buf and to access the history buffer,
* history length and the main buffer.
*
* \p len is the length of the main buffer to be scanned.
*
* \p start is an advisory hint representing the first offset at which a match
* may start. Some underlying literal matches may not respect it.
* may start. Some underlying literal matches may not respect it. Note: match
* starts may include masks overhanging the main literal.
*
* \p scratch is used to access the history buffer, history length and
* the main buffer.
*
* Two buffers/lengths are provided. Matches that occur entirely within
* the history buffer will not be reported by this function. The offsets
@ -129,10 +134,9 @@ hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len,
* match at byte 10 of the main buffer is reported as 10). Matches that start
* in the history buffer will have starts reported with 'negative' values.
*/
hwlm_error_t hwlmExecStreaming(const struct HWLM *tab,
struct hs_scratch *scratch, size_t len,
size_t start, HWLMCallback callback,
void *context, hwlm_group_t groups);
hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, size_t len, size_t start,
HWLMCallback callback,
struct hs_scratch *scratch, hwlm_group_t groups);
#ifdef __cplusplus
} /* extern "C" */

View File

@ -41,8 +41,12 @@
#include "scratch.h"
#include "ue2common.h"
#include "fdr/fdr_compile.h"
#include "fdr/fdr_compile_internal.h"
#include "fdr/fdr_engine_description.h"
#include "fdr/teddy_engine_description.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/make_unique.h"
#include "util/ue2string.h"
#include <cassert>
@ -53,6 +57,28 @@ using namespace std;
namespace ue2 {
HWLMProto::HWLMProto(u8 engType_in, vector<hwlmLiteral> lits_in)
: engType(engType_in), lits(move(lits_in)) {}
HWLMProto::HWLMProto(u8 engType_in,
unique_ptr<FDREngineDescription> eng_in,
vector<hwlmLiteral> lits_in,
map<u32, vector<u32>> bucketToLits_in,
bool make_small_in)
: engType(engType_in), fdrEng(move(eng_in)), lits(move(lits_in)),
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
HWLMProto::HWLMProto(u8 engType_in,
unique_ptr<TeddyEngineDescription> eng_in,
vector<hwlmLiteral> lits_in,
map<u32, vector<u32>> bucketToLits_in,
bool make_small_in)
: engType(engType_in), teddyEng(move(eng_in)),
lits(move(lits_in)),
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
HWLMProto::~HWLMProto() {}
static
void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
#ifdef DEBUG
@ -89,17 +115,55 @@ bool isNoodleable(const vector<hwlmLiteral> &lits,
return false;
}
if (!lits.front().msk.empty()) {
DEBUG_PRINTF("noodle can't handle supplementary masks\n");
return false;
}
return true;
}
bytecode_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits, bool make_small,
const CompileContext &cc,
bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
UNUSED hwlm_group_t expected_groups) {
size_t engSize = 0;
shared_ptr<void> eng;
const auto &lits = proto.lits;
DEBUG_PRINTF("building table with %zu strings\n", lits.size());
if (proto.engType == HWLM_ENGINE_NOOD) {
DEBUG_PRINTF("build noodle table\n");
const hwlmLiteral &lit = lits.front();
auto noodle = noodBuildTable(lit);
if (noodle) {
engSize = noodle.size();
}
eng = move(noodle);
} else {
DEBUG_PRINTF("building a new deal\n");
auto fdr = fdrBuildTable(proto, cc.grey);
if (fdr) {
engSize = fdr.size();
}
eng = move(fdr);
}
if (!eng) {
return nullptr;
}
assert(engSize);
if (engSize > cc.grey.limitLiteralMatcherSize) {
throw ResourceLimitError();
}
const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize;
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
h->type = proto.engType;
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
return h;
}
unique_ptr<HWLMProto>
hwlmBuildProto(vector<hwlmLiteral> &lits, bool make_small,
const CompileContext &cc) {
assert(!lits.empty());
dumpLits(lits);
@ -129,9 +193,7 @@ bytecode_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits, bool make_small,
}
}
u8 engType = 0;
size_t engSize = 0;
shared_ptr<void> eng;
unique_ptr<HWLMProto> proto;
DEBUG_PRINTF("building table with %zu strings\n", lits.size());
@ -139,39 +201,17 @@ bytecode_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits, bool make_small,
if (isNoodleable(lits, cc)) {
DEBUG_PRINTF("build noodle table\n");
engType = HWLM_ENGINE_NOOD;
const hwlmLiteral &lit = lits.front();
auto noodle = noodBuildTable(lit);
if (noodle) {
engSize = noodle.size();
}
eng = move(noodle);
proto = ue2::make_unique<HWLMProto>(HWLM_ENGINE_NOOD, lits);
} else {
DEBUG_PRINTF("building a new deal\n");
engType = HWLM_ENGINE_FDR;
auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey);
if (fdr) {
engSize = fdr.size();
proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small,
cc.target_info, cc.grey);
if (!proto) {
return nullptr;
}
eng = move(fdr);
}
if (!eng) {
return nullptr;
}
assert(engSize);
if (engSize > cc.grey.limitLiteralMatcherSize) {
throw ResourceLimitError();
}
const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize;
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
h->type = engType;
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
return h;
return proto;
}
size_t hwlmSize(const HWLM *h) {

View File

@ -34,9 +34,11 @@
#define HWLM_BUILD_H
#include "hwlm.h"
#include "hwlm_literal.h"
#include "ue2common.h"
#include "util/bytecode_ptr.h"
#include <map>
#include <memory>
#include <vector>
@ -44,15 +46,62 @@ struct HWLM;
namespace ue2 {
class FDREngineDescription;
class TeddyEngineDescription;
struct CompileContext;
struct Grey;
struct hwlmLiteral;
/** \brief Class representing a literal matcher prototype. */
struct HWLMProto {
/**
* \brief Engine type to distinguish noodle from FDR and Teddy.
*/
u8 engType;
/**
* \brief FDR engine description.
*/
std::unique_ptr<FDREngineDescription> fdrEng;
/**
* \brief Teddy engine description.
*/
std::unique_ptr<TeddyEngineDescription> teddyEng;
/**
* \brief HWLM literals passed from Rose.
*/
std::vector<hwlmLiteral> lits;
/**
* \brief Bucket assignment info in FDR and Teddy
*/
std::map<u32, std::vector<u32>> bucketToLits;
/**
* \brief Flag to optimise matcher for small size from Rose.
*/
bool make_small = false;
HWLMProto(u8 engType_in, std::vector<hwlmLiteral> lits_in);
HWLMProto(u8 engType_in, std::unique_ptr<FDREngineDescription> eng_in,
std::vector<hwlmLiteral> lits_in,
std::map<u32, std::vector<u32>> bucketToLits_in,
bool make_small_in);
HWLMProto(u8 engType_in, std::unique_ptr<TeddyEngineDescription> eng_in,
std::vector<hwlmLiteral> lits_in,
std::map<u32, std::vector<u32>> bucketToLits_in,
bool make_small_in);
~HWLMProto();
};
/** \brief Build an \ref HWLM literal matcher runtime structure for a group of
* literals.
*
* \param lits The group of literals.
* \param make_small Optimise matcher for small size.
* \param proto Literal matcher prototype.
* \param cc Compile context.
* \param expected_groups FIXME: document me!
*
@ -60,10 +109,13 @@ struct hwlmLiteral;
* may result in a nullptr return value, or a std::bad_alloc exception being
* thrown.
*/
bytecode_ptr<HWLM> hwlmBuild(const std::vector<hwlmLiteral> &lits,
bool make_small, const CompileContext &cc,
bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
hwlm_group_t expected_groups = HWLM_ALL_GROUPS);
std::unique_ptr<HWLMProto>
hwlmBuildProto(std::vector<hwlmLiteral> &lits, bool make_small,
const CompileContext &cc);
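The split above implies a two-phase compile: hwlmBuildProto produces an engine-neutral prototype from the literal set, and hwlmBuild materialises the runtime bytecode from it. A minimal sketch of how a caller might chain the two — the wrapper function and its error handling are illustrative, not part of the API:

```cpp
// Sketch only: assumes this header's own includes and a literal set populated
// by the Rose build code elsewhere in the compiler.
bytecode_ptr<HWLM> buildMatcherSketch(std::vector<hwlmLiteral> &lits,
                                      const CompileContext &cc) {
    auto proto = hwlmBuildProto(lits, /*make_small=*/false, cc);
    if (!proto) {
        return nullptr; // no engine could be constructed
    }
    return hwlmBuild(*proto, cc, HWLM_ALL_GROUPS);
}
```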
/**
* Returns an estimate of the number of repeated characters on the end of a
* literal that will make a literal set of size \a numLiterals suffer

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -38,16 +38,19 @@
#include "ue2common.h"
#include "fdr/fdr_dump.h"
#include "nfa/accel_dump.h"
#include <cstdio>
#include "util/dump_util.h"
#ifndef DUMP_SUPPORT
#error No dump support!
#endif
using namespace std;
namespace ue2 {
void hwlmPrintStats(const HWLM *h, FILE *f) {
void hwlmGenerateDumpFiles(const HWLM *h, const string &base) {
StdioFile f(base + ".txt", "w");
switch (h->type) {
case HWLM_ENGINE_NOOD:
noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
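The StdioFile object introduced here (and reused throughout the dump code below) is an RAII replacement for paired fopen/fclose calls: the file is closed on scope exit, and the wrapper converts to `FILE *` so existing fprintf calls and `FILE *` parameters keep working. A hedged approximation of its shape, since the real class in util/dump_util.h may differ in detail:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Approximate sketch of an RAII FILE* wrapper; not the actual StdioFile.
class StdioFileSketch {
public:
    StdioFileSketch(const std::string &name, const char *mode)
        : f(std::fopen(name.c_str(), mode)) {
        if (!f) {
            throw std::runtime_error("failed to open " + name);
        }
    }
    ~StdioFileSketch() { std::fclose(f); }
    StdioFileSketch(const StdioFileSketch &) = delete;
    StdioFileSketch &operator=(const StdioFileSketch &) = delete;
    operator FILE *() const { return f; } // pass straight to fprintf et al.
private:
    FILE *f;
};
```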

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,16 +35,16 @@
#ifdef DUMP_SUPPORT
#include <cstdio>
#include <string>
struct HWLM;
namespace ue2 {
/** \brief Dump some information about the given HWLM structure. */
void hwlmPrintStats(const HWLM *h, FILE *f);
void hwlmGenerateDumpFiles(const HWLM *h, const std::string &base);
} // namespace ue2
#endif
#endif
#endif // DUMP_SUPPORT
#endif // HWLM_DUMP_H

View File

@ -42,12 +42,11 @@
namespace ue2 {
/** \brief Max length of the literal passed to HWLM. */
#define HWLM_LITERAL_MAX_LEN 8
/** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. */
#define HWLM_MASKLEN 8
#define INVALID_LIT_ID ~0U
/** \brief Class representing a literal, fed to \ref hwlmBuild. */
struct hwlmLiteral {
std::string s; //!< \brief The literal itself.
@ -67,6 +66,21 @@ struct hwlmLiteral {
* can be quashed by the literal matcher. */
bool noruns;
/** \brief Included literal id. */
u32 included_id = INVALID_LIT_ID;
/** \brief Squash mask for FDR's confirm mask for included literals.
*
* In FDR confirm, if a literal is included in another bucket, this mask
* can squash that bucket's bit in the FDR confirm mask, so that the
* included literal's programs are run directly and the confirm work is
* avoided.
*
* This value is calculated in the FDR compile code once bucket assignment
* is complete.
*/
u8 squash = 0;
/** \brief Set of groups that literal belongs to.
*
* Use \ref HWLM_ALL_GROUPS for a literal that could match regardless of

View File

@ -35,14 +35,33 @@
#include "hwlm_literal.h"
#include "noodle_internal.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/verify_types.h"
#include "ue2common.h"
#include <cstring> // for memcpy
#include <vector>
using std::vector;
namespace ue2 {
static
u64a make_u64a_mask(const vector<u8> &v) {
assert(v.size() <= sizeof(u64a));
if (v.size() > sizeof(u64a)) {
throw std::exception();
}
u64a mask = 0;
size_t len = v.size();
unsigned char *m = (unsigned char *)&mask;
DEBUG_PRINTF("making mask len %zu\n", len);
memcpy(m, &v[0], len);
return mask;
}
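A worked check of the helper above, assuming the little-endian layout Hyperscan targets: the first vector byte is copied to the lowest address, i.e. the least-significant byte of the result.

```cpp
// Hypothetical usage; little-endian assumption.
u64a m = make_u64a_mask({0x0f, 0xff});
assert(m == 0xff0fULL); // v[0] = 0x0f in the LSB, v[1] = 0xff above it
```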
static
size_t findNoodFragOffset(const hwlmLiteral &lit) {
const auto &s = lit.s;
@ -67,30 +86,59 @@ size_t findNoodFragOffset(const hwlmLiteral &lit) {
}
bytecode_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) {
if (!lit.msk.empty()) {
DEBUG_PRINTF("noodle can't handle supplementary masks\n");
return nullptr;
const auto &s = lit.s;
size_t mask_len = std::max(s.length(), lit.msk.size());
DEBUG_PRINTF("mask is %zu bytes\n", lit.msk.size());
assert(mask_len <= 8);
assert(lit.msk.size() == lit.cmp.size());
vector<u8> n_msk(mask_len);
vector<u8> n_cmp(mask_len);
for (unsigned i = mask_len - lit.msk.size(), j = 0; i < mask_len;
i++, j++) {
DEBUG_PRINTF("m[%u] %hhx c[%u] %hhx\n", i, lit.msk[j], i, lit.cmp[j]);
n_msk[i] = lit.msk[j];
n_cmp[i] = lit.cmp[j];
}
const auto &s = lit.s;
size_t noodle_len = sizeof(noodTable) + s.length();
auto n = make_zeroed_bytecode_ptr<noodTable>(noodle_len);
size_t s_off = mask_len - s.length();
for (unsigned i = s_off; i < mask_len; i++) {
u8 c = s[i - s_off];
u8 si_msk = lit.nocase && ourisalpha(c) ? (u8)CASE_CLEAR : (u8)0xff;
n_msk[i] |= si_msk;
n_cmp[i] |= c & si_msk;
assert((n_cmp[i] & si_msk) == c);
DEBUG_PRINTF("m[%u] %hhx c[%u] %hhx '%c'\n", i, n_msk[i], i, n_cmp[i],
ourisprint(c) ? (char)c : '.');
}
auto n = make_zeroed_bytecode_ptr<noodTable>(sizeof(noodTable));
assert(n);
DEBUG_PRINTF("size of nood %zu\n", sizeof(noodTable));
size_t key_offset = findNoodFragOffset(lit);
n->id = lit.id;
n->len = verify_u32(s.length());
n->key_offset = verify_u32(key_offset);
n->single = s.length() == 1 ? 1 : 0;
n->key_offset = verify_u8(s.length() - key_offset);
n->nocase = lit.nocase ? 1 : 0;
memcpy(n->str, s.c_str(), s.length());
n->key0 = s[key_offset];
if (n->single) {
n->key1 = 0;
} else {
n->key1 = s[key_offset + 1];
}
n->msk = make_u64a_mask(n_msk);
n->cmp = make_u64a_mask(n_cmp);
n->msk_len = mask_len;
return n;
}
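To make the mask construction concrete, consider a hypothetical caseless literal "Ab" with no supplementary mask: both bytes are alphabetic, so each gets a CASE_CLEAR (0xdf) mask byte and an uppercased compare byte, and any case mix of the literal then satisfies `(v & msk) == cmp`.

```cpp
// Hedged illustration for lit.s = "Ab", lit.nocase = true, lit.msk empty:
//   n_msk = {0xdf, 0xdf}              (CASE_CLEAR for both alpha bytes)
//   n_cmp = {'A' & 0xdf, 'b' & 0xdf}  = {0x41, 0x42}
// After make_u64a_mask (little-endian): msk = 0xdfdf, cmp = 0x4241.
u64a v = 0x42ULL << 8 | 0x61ULL;      // input bytes "aB"
assert((v & 0xdfdfULL) == 0x4241ULL); // matches regardless of case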
size_t noodSize(const noodTable *n) {
assert(n); // shouldn't call with null
return sizeof(*n) + n->len;
size_t noodSize(const noodTable *) {
return sizeof(noodTable);
}
} // namespace ue2
@ -102,13 +150,17 @@ namespace ue2 {
void noodPrintStats(const noodTable *n, FILE *f) {
fprintf(f, "Noodle table\n");
fprintf(f, "Len: %u Key Offset: %u\n", n->len, n->key_offset);
fprintf(f, "Key Offset: %u\n", n->key_offset);
fprintf(f, "Msk: %llx Cmp: %llx MskLen %u\n",
n->msk >> 8 * (8 - n->msk_len), n->cmp >> 8 * (8 - n->msk_len),
n->msk_len);
fprintf(f, "String: ");
for (u32 i = 0; i < n->len; i++) {
if (isgraph(n->str[i]) && n->str[i] != '\\') {
fprintf(f, "%c", n->str[i]);
for (u32 i = 0; i < n->msk_len; i++) {
const u8 *m = (const u8 *)&n->cmp;
if (isgraph(m[i]) && m[i] != '\\') {
fprintf(f, "%c", m[i]);
} else {
fprintf(f, "\\x%02hhx", n->str[i]);
fprintf(f, "\\x%02hhx", m[i]);
}
}
fprintf(f, "\n");

View File

@ -32,6 +32,7 @@
#include "hwlm.h"
#include "noodle_engine.h"
#include "noodle_internal.h"
#include "scratch.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
@ -39,6 +40,7 @@
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/masked_move.h"
#include "util/partial_store.h"
#include "util/simd_utils.h"
#include <ctype.h>
@ -49,7 +51,7 @@
struct cb_info {
HWLMCallback cb; //!< callback function called on match
u32 id; //!< ID to pass to callback on match
void *ctx; //!< caller-supplied context to pass to callback
struct hs_scratch *scratch; //!< scratch to pass to callback
size_t offsetAdj; //!< used in streaming mode
};
@ -83,9 +85,8 @@ struct cb_info {
while (unlikely(z)) { \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi, \
matchPos); \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \
RETURN_IF_TERMINATED(rv); \
} \
} while (0)
@ -95,9 +96,8 @@ struct cb_info {
while (unlikely(z)) { \
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
size_t matchPos = d - buf + pos - 1; \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1, \
noCase, cbi, matchPos); \
DEBUG_PRINTF("match pos %zu\n", matchPos); \
hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \
RETURN_IF_TERMINATED(rv); \
} \
} while (0)
@ -111,21 +111,26 @@ u8 caseClear8(u8 x, bool noCase) {
// is used only for single chars with case insensitivity used correctly,
// so it can go straight to the callback if we get this far.
static really_inline
hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
size_t keyOffset, bool is_double, bool noCase,
const struct cb_info *cbi, size_t pos) {
pos -= keyOffset;
if (is_double) {
if (pos + keyLen > len) {
return HWLM_SUCCESS;
}
if (cmpForward(buf + pos, key, keyLen, noCase)) { // ret 1 on mismatch
return HWLM_SUCCESS;
hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len,
char single, const struct cb_info *cbi, size_t pos) {
if (single) {
if (n->msk_len == 1) {
goto match;
}
}
pos += cbi->offsetAdj;
DEBUG_PRINTF("match @ %zu->%zu\n", pos, (pos + keyLen - 1));
hwlmcb_rv_t rv = cbi->cb(pos, (pos + keyLen - 1), cbi->id, cbi->ctx);
assert(len >= n->msk_len);
u64a v =
partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len);
DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp);
if ((v & n->msk) != n->cmp) {
/* mask didn't match */
return HWLM_SUCCESS;
}
match:
pos -= cbi->offsetAdj;
DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset);
hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
@ -147,38 +152,43 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
#endif
static really_inline
hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key,
bool noCase, const struct cb_info *cbi) {
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t start, bool noCase,
const struct cb_info *cbi) {
const MASK_TYPE mask1 = getMask(key[0], noCase);
const MASK_TYPE mask1 = getMask(n->key0, noCase);
const MASK_TYPE caseMask = getCaseMask();
size_t offset = start + n->msk_len - 1;
size_t end = len;
assert(offset < end);
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
size_t end = len;
if (len < CHUNKSIZE) {
rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len);
if (end - offset < CHUNKSIZE) {
rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset,
end);
return rv;
}
if (len == CHUNKSIZE) {
rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi,
0, len);
if (end - offset == CHUNKSIZE) {
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
cbi, offset, end);
return rv;
}
uintptr_t data = (uintptr_t)buf;
uintptr_t s2Start = ROUNDUP_N(data, CHUNKSIZE) - data;
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
uintptr_t last = data + end;
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
uintptr_t s3Start = len - CHUNKSIZE;
uintptr_t s3Start = end - CHUNKSIZE;
if (s2Start) {
if (offset != s2Start) {
// first scan out to the fast scan starting point
DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi,
0, s2Start);
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
cbi, offset, s2Start);
RETURN_IF_TERMINATED(rv);
}
@ -186,68 +196,70 @@ hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key,
// scan as far as we can, bounded by the last point this key can
// possibly match
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
rv = scanSingleFast(buf, len, key, noCase, caseMask, mask1, cbi,
s2Start, s2End);
rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start,
s2End);
RETURN_IF_TERMINATED(rv);
}
// if we are done bail out
if (s2End == end) {
if (s2End == len) {
return HWLM_SUCCESS;
}
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, end);
rv = scanSingleUnaligned(buf, len, s3Start, key, noCase, caseMask, mask1,
cbi, s2End, end);
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi,
s2End, len);
return rv;
#else // HAVE_AVX512
return scanSingle512(buf, len, key, noCase, caseMask, mask1, cbi);
return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset,
end);
#endif
}
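The staging above follows a common pattern: an unaligned head scan up to the first CHUNKSIZE boundary, an aligned fast loop over whole chunks, then an unaligned tail. An illustrative skeleton of the bounds arithmetic — names and the plain power-of-two rounding are mine; the real code uses ROUNDUP_N/ROUNDDOWN_N, SIMD loads, and handles short buffers separately as shown above:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the three-stage scan bounds; chunk must be a power of two.
void stagedScanSketch(const uint8_t *buf, size_t offset, size_t end,
                      size_t chunk) {
    uintptr_t data = (uintptr_t)buf;
    size_t s2Start = ((data + offset + chunk - 1) & ~(chunk - 1)) - data;
    size_t s2End = ((data + end) & ~(chunk - 1)) - data;
    if (offset != s2Start) {
        /* stage 1: scan [offset, s2Start) with one unaligned load */
    }
    if (s2Start != s2End) {
        /* stage 2: scan [s2Start, s2End) with aligned full-chunk loads */
    }
    if (s2End != end) {
        /* stage 3: scan [s2End, end) with a final unaligned load */
    }
}
```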
static really_inline
hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t start, bool noCase,
const struct cb_info *cbi) {
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - keyLen + keyOffset + 2;
size_t end = len - n->key_offset + 2;
// the first place the key can match
size_t offset = start + n->msk_len - n->key_offset;
const MASK_TYPE caseMask = getCaseMask();
const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase);
const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase);
const MASK_TYPE mask1 = getMask(n->key0, noCase);
const MASK_TYPE mask2 = getMask(n->key1, noCase);
#if !defined(HAVE_AVX512)
hwlm_error_t rv;
if (end - keyOffset < CHUNKSIZE) {
rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, keyOffset, end);
if (end - offset < CHUNKSIZE) {
rv = scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
offset, end);
return rv;
}
if (end - keyOffset == CHUNKSIZE) {
rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset,
noCase, caseMask, mask1, mask2, cbi, keyOffset,
end);
if (end - offset == CHUNKSIZE) {
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
mask2, cbi, offset, end);
return rv;
}
uintptr_t data = (uintptr_t)buf;
uintptr_t s2Start = ROUNDUP_N(data + keyOffset, CHUNKSIZE) - data;
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
uintptr_t s1End = s2Start + 1;
uintptr_t last = data + end;
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
uintptr_t s3Start = end - CHUNKSIZE;
uintptr_t off = keyOffset;
uintptr_t off = offset;
if (s2Start != keyOffset) {
if (s2Start != off) {
// first scan out to the fast scan starting point plus one char past to
// catch the key on the overlap
DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset,
noCase, caseMask, mask1, mask2, cbi, off,
s1End);
DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
mask2, cbi, off, s1End);
RETURN_IF_TERMINATED(rv);
}
off = s1End;
@ -261,8 +273,8 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
// scan as far as we can, bounded by the last point this key can
// possibly match
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
rv = scanDoubleFast(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, s2Start, s2End);
rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
s2Start, s2End);
RETURN_IF_TERMINATED(rv);
off = s2End;
}
@ -273,130 +285,158 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
}
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
rv = scanDoubleUnaligned(buf, len, s3Start, key, keyLen, keyOffset, noCase,
caseMask, mask1, mask2, cbi, off, end);
rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1,
mask2, cbi, off, end);
return rv;
#else // AVX512
return scanDouble512(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, keyOffset, end);
return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
offset, end);
#endif // AVX512
}
static really_inline
hwlm_error_t scanSingleNoCase(const u8 *buf, size_t len, const u8 *key,
hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanSingleMain(buf, len, key, 1, cbi);
return scanSingleMain(n, buf, len, start, 1, cbi);
}
static really_inline
hwlm_error_t scanSingleCase(const u8 *buf, size_t len, const u8 *key,
hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanSingleMain(buf, len, key, 0, cbi);
return scanSingleMain(n, buf, len, start, 0, cbi);
}
// Single-character specialisation, used when n->single is set (the literal
// itself is one byte)
static really_inline
hwlm_error_t scanSingle(const u8 *buf, size_t len, const u8 *key, bool noCase,
const struct cb_info *cbi) {
if (!ourisalpha(key[0])) {
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
if (!ourisalpha(n->key0)) {
noCase = 0; // force noCase off if we don't have an alphabetic char
}
// kinda ugly, but this forces constant propagation
if (noCase) {
return scanSingleNoCase(buf, len, key, cbi);
return scanSingleNoCase(n, buf, len, start, cbi);
} else {
return scanSingleCase(buf, len, key, cbi);
return scanSingleCase(n, buf, len, start, cbi);
}
}
static really_inline
hwlm_error_t scanDoubleNoCase(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset,
hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanDoubleMain(buf, len, key, keyLen, keyOffset, 1, cbi);
return scanDoubleMain(n, buf, len, start, 1, cbi);
}
static really_inline
hwlm_error_t scanDoubleCase(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset,
hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf,
size_t len, size_t start,
const struct cb_info *cbi) {
return scanDoubleMain(buf, len, key, keyLen, keyOffset, 0, cbi);
return scanDoubleMain(n, buf, len, start, 0, cbi);
}
static really_inline
hwlm_error_t scanDouble(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
size_t keyOffset, bool noCase,
const struct cb_info *cbi) {
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
// kinda ugly, but this forces constant propagation
if (noCase) {
return scanDoubleNoCase(buf, len, key, keyLen, keyOffset, cbi);
return scanDoubleNoCase(n, buf, len, start, cbi);
} else {
return scanDoubleCase(buf, len, key, keyLen, keyOffset, cbi);
return scanDoubleCase(n, buf, len, start, cbi);
}
}
// main entry point for the scan code
static really_inline
hwlm_error_t scan(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
size_t keyOffset, bool noCase, const struct cb_info *cbi) {
if (len < keyLen) {
hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, char single, bool noCase,
const struct cb_info *cbi) {
if (len - start < n->msk_len) {
// can't find a string of msk_len bytes in a shorter buffer
return HWLM_SUCCESS;
}
if (keyLen == 1) {
assert(keyOffset == 0);
return scanSingle(buf, len, key, noCase, cbi);
if (single) {
return scanSingle(n, buf, len, start, noCase, cbi);
} else {
return scanDouble(buf, len, key, keyLen, keyOffset, noCase, cbi);
return scanDouble(n, buf, len, start, noCase, cbi);
}
}
/** \brief Block-mode scanner. */
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
size_t offset_adj, HWLMCallback cb, void *ctxt) {
size_t start, HWLMCallback cb,
struct hs_scratch *scratch) {
assert(n && buf);
struct cb_info cbi = { cb, n->id, ctxt, offset_adj };
DEBUG_PRINTF("nood scan of %zu bytes for %*s\n", len, n->len, n->str);
return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi);
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len,
(const char *)&n->cmp, buf);
return scan(n, buf, len, start, n->single, n->nocase, &cbi);
}
/** \brief Streaming-mode scanner. */
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
HWLMCallback cb, void *ctxt, u8 *temp_buf,
UNUSED size_t temp_buffer_size) {
HWLMCallback cb, struct hs_scratch *scratch) {
assert(n);
struct cb_info cbi = {cb, n->id, ctxt, 0};
hwlm_error_t rv;
if (len + hlen < n->msk_len) {
DEBUG_PRINTF("not enough bytes for a match\n");
return HWLM_SUCCESS;
}
if (hlen) {
struct cb_info cbi = {cb, n->id, scratch, 0};
DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen,
n->msk_len, (const char *)&n->cmp, buf);
if (hlen && n->msk_len > 1) {
/*
* we have history, so build up a buffer from enough of the history
* buffer plus what we've been given to scan. Since this is relatively
* short, just check against msk+cmp per byte offset for matches.
*/
assert(hbuf);
u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2];
memset(temp_buf, 0, sizeof(temp_buf));
size_t tl1 = MIN(n->len - 1, hlen);
size_t tl2 = MIN(n->len - 1, len);
size_t temp_len = tl1 + tl2;
assert(temp_len < temp_buffer_size);
memcpy(temp_buf, hbuf + hlen - tl1, tl1);
memcpy(temp_buf + tl1, buf, tl2);
assert(n->msk_len);
size_t tl1 = MIN((size_t)n->msk_len - 1, hlen);
size_t tl2 = MIN((size_t)n->msk_len - 1, len);
cbi.offsetAdj = -tl1;
rv = scan(temp_buf, temp_len, n->str, n->len, n->key_offset, n->nocase,
&cbi);
if (rv == HWLM_TERMINATED) {
return HWLM_TERMINATED;
assert(tl1 + tl2 <= sizeof(temp_buf));
assert(tl1 + tl2 >= n->msk_len);
assert(tl1 <= sizeof(u64a));
assert(tl2 <= sizeof(u64a));
DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2);
unaligned_store_u64a(temp_buf,
partial_load_u64a(hbuf + hlen - tl1, tl1));
unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2));
for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) {
u64a v = unaligned_load_u64a(temp_buf + i);
if ((v & n->msk) == n->cmp) {
size_t m_end = -tl1 + i + n->msk_len - 1;
DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i);
hwlmcb_rv_t rv = cb(m_end, n->id, scratch);
if (rv == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATED;
}
}
}
}
assert(buf);
cbi.offsetAdj = 0;
return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi);
return scan(n, buf, len, 0, n->single, n->nocase, &cbi);
}
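To make the stitch-buffer arithmetic concrete (hypothetical numbers): with msk_len = 4 the scanner copies at most 3 trailing history bytes and 3 leading new bytes, and a hit at temp-buffer offset i reports an end offset of i relative to the start of the new block.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
    // Illustrative values only; mirrors the tl1/tl2/m_end arithmetic above.
    size_t msk_len = 4, hlen = 8, len = 16;
    size_t tl1 = std::min(msk_len - 1, hlen); // 3 trailing history bytes
    size_t tl2 = std::min(msk_len - 1, len);  // 3 leading new bytes
    for (size_t i = 0; i + msk_len <= tl1 + tl2; i++) {
        size_t m_end = i + msk_len - 1 - tl1; // 0, 1, 2: ends in new data
        assert(m_end < tl2);
    }
    return 0;
}
```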

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,16 +41,17 @@ extern "C"
#endif
struct noodTable;
struct hs_scratch;
/** \brief Block-mode scanner. */
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
size_t offset_adj, HWLMCallback cb, void *ctxt);
size_t start, HWLMCallback cb,
struct hs_scratch *scratch);
/** \brief Streaming-mode scanner. */
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
HWLMCallback cb, void *ctxt, u8 *temp_buf,
size_t temp_buffer_size);
HWLMCallback cb, struct hs_scratch *scratch);
#ifdef __cplusplus
} /* extern "C" */

View File

@ -38,10 +38,11 @@ static really_inline m256 getCaseMask(void) {
}
static really_inline
hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset,
const u8 *key, bool noCase, m256 caseMask,
m256 mask1, const struct cb_info *cbi,
size_t start, size_t end) {
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
@ -66,11 +67,11 @@ hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset,
}
static really_inline
hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset,
const u8 *key, size_t keyLen, size_t keyOffset,
bool noCase, m256 caseMask, m256 mask1,
m256 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m256 caseMask, m256 mask1, m256 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
size_t l = end - start;
@ -100,8 +101,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset,
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to small/unaligned chunk at end)
static really_inline
hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
bool noCase, m256 caseMask, m256 mask1,
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
@ -140,11 +141,10 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m256 caseMask, m256 mask1, m256 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
m256 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
if (!l) {
@ -182,8 +182,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key,
bool noCase, m256 caseMask, m256 mask1,
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
@ -203,10 +203,9 @@ hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m256 caseMask, m256 mask1, m256 mask2,
const struct cb_info *cbi, size_t start,
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m256 caseMask, m256 mask1,
m256 mask2, const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
DEBUG_PRINTF("start %zu end %zu \n", start, end);

View File

@ -43,8 +43,8 @@ m512 getCaseMask(void) {
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to small/unaligned chunk at end)
static really_inline
hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
bool noCase, m512 caseMask, m512 mask1,
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
@ -73,11 +73,12 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key,
hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len,
bool noCase, m512 caseMask, m512 mask1,
const struct cb_info *cbi) {
const u8 *d = buf;
const u8 *e = buf + len;
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (d + 64 >= e) {
@ -86,8 +87,8 @@ hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key,
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0,
d1 - d) == HWLM_TERMINATED) {
if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start,
d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
@ -106,16 +107,15 @@ tail:
DEBUG_PRINTF("d %p e %p \n", d, e);
// finish off tail
return scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, d - buf,
return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf,
e - buf);
}
static really_inline
hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m512 caseMask, m512 mask1, m512 mask2,
const struct cb_info *cbi, u64a *lastz0,
size_t start, size_t end) {
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m512 caseMask, m512 mask1,
m512 mask2, const struct cb_info *cbi,
u64a *lastz0, size_t start, size_t end) {
DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0);
const u8 *d = buf + start;
ptrdiff_t scan_len = end - start;
@ -142,9 +142,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m512 caseMask, m512 mask1, m512 mask2,
hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len,
bool noCase, m512 caseMask, m512 mask1, m512 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
@ -158,9 +157,8 @@ hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key,
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, 64);
if (scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, &lastz0, start,
d1 - buf) == HWLM_TERMINATED) {
if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
&lastz0, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
@ -188,6 +186,6 @@ tail:
DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf);
// finish off tail
return scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
mask1, mask2, cbi, &lastz0, d - buf, end);
return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
&lastz0, d - buf, end);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -38,8 +38,8 @@ static really_inline m128 getCaseMask(void) {
}
static really_inline
hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
bool noCase, m128 caseMask, m128 mask1,
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start;
@ -67,10 +67,11 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset,
const u8 *key, bool noCase, m128 caseMask,
m128 mask1, const struct cb_info *cbi,
size_t start, size_t end) {
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
const size_t l = end - start;
@ -96,11 +97,10 @@ hwlm_error_t scanSingleUnaligned(const u8 *buf, size_t len, size_t offset,
}
static really_inline
hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m128 caseMask, m128 mask1, m128 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
m128 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
const u8 *d = buf + start;
size_t l = end - start;
if (!l) {
@ -128,11 +128,11 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset,
const u8 *key, size_t keyLen, size_t keyOffset,
bool noCase, m128 caseMask, m128 mask1,
m128 mask2, const struct cb_info *cbi,
size_t start, size_t end) {
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset, bool noCase,
m128 caseMask, m128 mask1, m128 mask2,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + offset;
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
size_t l = end - start;
@ -158,8 +158,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset,
}
static really_inline
hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key,
bool noCase, m128 caseMask, m128 mask1,
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
@ -179,10 +179,9 @@ hwlm_error_t scanSingleFast(const u8 *buf, size_t len, const u8 *key,
}
static really_inline
hwlm_error_t scanDoubleFast(const u8 *buf, size_t len, const u8 *key,
size_t keyLen, size_t keyOffset, bool noCase,
m128 caseMask, m128 mask1, m128 mask2,
const struct cb_info *cbi, size_t start,
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
size_t len, bool noCase, m128 caseMask, m128 mask1,
m128 mask2, const struct cb_info *cbi, size_t start,
size_t end) {
const u8 *d = buf + start, *e = buf + end;
assert(d < e);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,18 +30,22 @@
* \brief Data structures for Noodle literal matcher engine.
*/
#ifndef NOODLE_INTERNAL_H_25D751C42E34A6
#define NOODLE_INTERNAL_H_25D751C42E34A6
#ifndef NOODLE_INTERNAL_H
#define NOODLE_INTERNAL_H
#include "ue2common.h"
struct noodTable {
u32 id;
u32 len;
u32 key_offset;
u8 nocase;
u8 str[];
u64a msk;
u64a cmp;
u8 msk_len;
u8 key_offset;
u8 nocase;
u8 single;
u8 key0;
u8 key1;
};
#endif /* NOODLE_INTERNAL_H_25D751C42E34A6 */
#endif /* NOODLE_INTERNAL_H */

View File

@ -41,6 +41,8 @@
#include "util/verify_types.h"
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#define PATHS_LIMIT 500
@ -65,6 +67,17 @@ void dump_paths(const Container &paths) {
DEBUG_PRINTF("%zu paths\n", paths.size());
}
static
vector<CharReach> reverse_alpha_remapping(const raw_dfa &rdfa) {
vector<CharReach> rv(rdfa.alpha_size - 1); /* TOP not required */
for (u32 i = 0; i < N_CHARS; i++) {
rv.at(rdfa.alpha_remap[i]).set(i);
}
return rv;
}
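A standalone model of this inversion may help (illustrative names and types; the real helper uses CharReach and drops the TOP symbol): every remapped symbol collects the set of input bytes that the remap table sends to it.

```cpp
#include <array>
#include <bitset>
#include <vector>

// Sketch: invert a 256-entry byte -> symbol remap into per-symbol byte sets.
std::vector<std::bitset<256>>
reverseRemapSketch(const std::array<unsigned, 256> &remap,
                   unsigned alpha_size) {
    std::vector<std::bitset<256>> rv(alpha_size);
    for (unsigned c = 0; c < 256; c++) {
        rv.at(remap[c]).set(c); // e.g. 'a' and 'A' often share one symbol
    }
    return rv;
}
```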
static
bool is_useful_path(const vector<path> &good, const path &p) {
for (const auto &g : good) {
@ -98,9 +111,10 @@ path append(const path &orig, const CharReach &cr, u32 new_dest) {
}
static
void extend(const raw_dfa &rdfa, const path &p,
map<u32, vector<path>> &all, vector<path> &out) {
dstate s = rdfa.states[p.dest];
void extend(const raw_dfa &rdfa, const vector<CharReach> &rev_map,
const path &p, unordered_map<u32, vector<path>> &all,
vector<path> &out) {
const dstate &s = rdfa.states[p.dest];
if (!p.reach.empty() && p.reach.back().none()) {
out.push_back(p);
@ -125,9 +139,9 @@ void extend(const raw_dfa &rdfa, const path &p,
}
flat_map<u32, CharReach> dest;
for (unsigned i = 0; i < N_CHARS; i++) {
u32 succ = s.next[rdfa.alpha_remap[i]];
dest[succ].set(i);
for (u32 i = 0; i < rev_map.size(); i++) {
u32 succ = s.next[i];
dest[succ] |= rev_map[i];
}
for (const auto &e : dest) {
@ -148,13 +162,14 @@ void extend(const raw_dfa &rdfa, const path &p,
static
vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
dstate_id_t base, u32 len) {
const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
vector<path> paths{path(base)};
map<u32, vector<path>> all;
unordered_map<u32, vector<path>> all;
all[base].push_back(path(base));
for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) {
vector<path> next_gen;
for (const auto &p : paths) {
extend(rdfa, p, all, next_gen);
extend(rdfa, rev_map, p, all, next_gen);
}
paths = move(next_gen);
@ -195,17 +210,6 @@ bool better(const AccelScheme &a, const AccelScheme &b) {
return a.cr.count() < b.cr.count();
}
static
vector<CharReach> reverse_alpha_remapping(const raw_dfa &rdfa) {
vector<CharReach> rv(rdfa.alpha_size - 1); /* TOP not required */
for (u32 i = 0; i < N_CHARS; i++) {
rv.at(rdfa.alpha_remap[i]).set(i);
}
return rv;
}
static
bool double_byte_ok(const AccelScheme &info) {
return !info.double_byte.empty() &&
@ -225,16 +229,16 @@ bool has_self_loop(dstate_id_t s, const raw_dfa &raw) {
}
static
vector<u16> find_nonexit_symbols(const raw_dfa &rdfa,
const CharReach &escape) {
set<u16> rv;
flat_set<u16> find_nonexit_symbols(const raw_dfa &rdfa,
const CharReach &escape) {
flat_set<u16> rv;
CharReach nonexit = ~escape;
for (auto i = nonexit.find_first(); i != CharReach::npos;
for (auto i = nonexit.find_first(); i != nonexit.npos;
i = nonexit.find_next(i)) {
rv.insert(rdfa.alpha_remap[i]);
}
return vector<u16>(rv.begin(), rv.end());
return rv;
}
static
@ -254,7 +258,7 @@ dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
u16 top_remap = raw.alpha_remap[TOP];
ue2::unordered_set<dstate_id_t> seen;
std::unordered_set<dstate_id_t> seen;
while (true) {
seen.insert(s);
DEBUG_PRINTF("basis %hu\n", s);
@ -288,7 +292,7 @@ dstate_id_t get_sds_or_proxy(const raw_dfa &raw) {
static
set<dstate_id_t> find_region(const raw_dfa &rdfa, dstate_id_t base,
const AccelScheme &ei) {
const AccelScheme &ei) {
DEBUG_PRINTF("looking for region around %hu\n", base);
set<dstate_id_t> region = {base};

View File

@ -44,6 +44,8 @@
#include "util/simd_types.h"
#include <cstdio>
#include <map>
#include <set>
#include <vector>
#ifndef DUMP_SUPPORT

View File

@ -31,7 +31,7 @@
#include "ue2common.h"
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
union AccelAux;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -71,7 +71,7 @@ void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
const Castle *c = (const Castle *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
fprintf(f, "Castle multi-tenant repeat engine\n");
fprintf(f, "\n");
@ -117,7 +117,6 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
fprintf(f, "Sub %u:\n", i);
dumpTextSubCastle(sub[i], f);
}
fclose(f);
}
} // namespace ue2

View File

@ -48,11 +48,11 @@
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/make_unique.h"
#include "util/multibit_build.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/verify_types.h"
#include "grey.h"
@ -153,13 +153,11 @@ static
void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
const CliqueVertex &cv, const set<u32> &group) {
u32 id = g[cv].stateId;
ue2::unordered_set<u32> neighborId;
// find neighbors for cv
for (const auto &v : adjacent_vertices_range(cv, g)) {
if (g[v].stateId != id && contains(group, g[v].stateId)){
if (g[v].stateId != id && contains(group, g[v].stateId)) {
neighbor.push_back(g[v].stateId);
neighborId.insert(g[v].stateId);
DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
}
}
@ -772,7 +770,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2,
const u32 top = m.first;
const PureRepeat &pr = m.second;
DEBUG_PRINTF("top %u\n", top);
u32 new_top = c1.add(pr);
u32 new_top = c1.merge(pr);
top_map[top] = new_top;
DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top);
}
@ -883,7 +881,7 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) {
}
bool requiresDedupe(const CastleProto &proto,
const ue2::flat_set<ReportID> &reports) {
const flat_set<ReportID> &reports) {
for (const auto &report : reports) {
auto it = proto.report_map.find(report);
if (it == end(proto.report_map)) {

View File

@ -39,11 +39,12 @@
#include "nfagraph/ng_repeat.h"
#include "util/bytecode_ptr.h"
#include "util/depth.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <vector>
struct NFA;
@ -89,7 +90,7 @@ struct CastleProto {
std::map<u32, PureRepeat> repeats;
/** \brief Mapping from report to associated tops. */
ue2::unordered_map<ReportID, flat_set<u32>> report_map;
std::unordered_map<ReportID, flat_set<u32>> report_map;
/**
* \brief Next top id to use. Repeats may be removed without top remapping,
@ -127,7 +128,9 @@ buildCastle(const CastleProto &proto,
const CompileContext &cc, const ReportManager &rm);
/**
* \brief Merge two CastleProto prototypes together, if possible.
* \brief Merge two CastleProto prototypes together, if possible. If a
* particular repeat from c2 is already in c1, then it will be reused rather
* than adding a duplicate repeat.
*
* Returns true if merge of all repeats in c2 into c1 succeeds, and fills
* mapping with the repeat indices.
@ -155,7 +158,7 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2);
* of the reports in the given set.
*/
bool requiresDedupe(const CastleProto &proto,
const ue2::flat_set<ReportID> &reports);
const flat_set<ReportID> &reports);
/**
* \brief Build an NGHolder from a CastleProto.

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -30,11 +30,9 @@
namespace ue2 {
// prevent weak vtables for raw_report_info, dfa_build_strat and raw_dfa
// prevent weak vtables for raw_report_info, dfa_build_strat
raw_report_info::~raw_report_info() {}
dfa_build_strat::~dfa_build_strat() {}
raw_dfa::~raw_dfa() {}
} // namespace ue2

View File

@ -59,12 +59,13 @@
#include "dfa_min.h"
#include "grey.h"
#include "mcclellancompile_util.h"
#include "rdfa.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/noncopyable.h"
#include "util/partitioned_set.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <functional>
@ -299,6 +300,10 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
return;
}
if (is_dead(rdfa)) {
DEBUG_PRINTF("dfa is empty\n");
}
UNUSED const size_t states_before = rdfa.states.size();
HopcroftInfo info(rdfa);

View File

@ -37,11 +37,11 @@
#include "nfa_internal.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/verify_types.h"
#include "ue2common.h"

View File

@ -33,7 +33,7 @@
#include "nfa_kind.h"
#include "ue2common.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include "util/order_check.h"
#include <map>

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,8 +32,10 @@
#include "goughcompile_internal.h"
#include "grey.h"
#include "util/container.h"
#include "util/dump_util.h"
#include "util/graph_range.h"
#include <sstream>
#include <string>
#ifndef DUMP_SUPPORT
@ -66,10 +68,7 @@ string dump_name(const gough_edge_id &e) {
static
void dump_graph(const GoughGraph &g, const string &base, const Grey &grey) {
stringstream ss;
ss << grey.dumpPath << "gough_" << base << ".dot";
FILE *f = fopen(ss.str().c_str(), "w");
StdioFile f(grey.dumpPath + "gough_" + base + ".dot", "w");
fprintf(f, "digraph NFA {\n");
fprintf(f, "rankdir=LR;\n");
@ -94,8 +93,6 @@ void dump_graph(const GoughGraph &g, const string &base, const Grey &grey) {
dump_name(g[s]).c_str(), dump_name(g[t]).c_str());
}
fprintf(f, "}\n");
fclose(f);
}
static
@ -133,9 +130,7 @@ set<const GoughSSAVar *> uses(const GoughEdgeProps &ep) {
static
void dump_var_mapping(const GoughGraph &g, const string &base,
const Grey &grey) {
stringstream ss;
ss << grey.dumpPath << "gough_" << base << "_vars.txt";
FILE *f = fopen(ss.str().c_str(), "w");
StdioFile f(grey.dumpPath + "gough_" + base + "_vars.txt", "w");
for (auto v : vertices_range(g)) {
set<const GoughSSAVar *> used = uses(g[v]);
if (g[v].vars.empty() && used.empty()) {
@ -180,7 +175,6 @@ void dump_var_mapping(const GoughGraph &g, const string &base,
fprintf(f, "\n");
}
}
fclose(f);
}
static
@ -220,12 +214,7 @@ void gather_vars(const GoughGraph &g, vector<const GoughSSAVar *> *vars,
static
void dump_vars(const GoughGraph &g, const string &base, const Grey &grey) {
FILE *f;
{
stringstream ss;
ss << grey.dumpPath << "gough_" << base << "_vars.dot";
f = fopen(ss.str().c_str(), "w");
}
StdioFile f(grey.dumpPath + "gough_" + base + "_vars.dot", "w");
fprintf(f, "digraph NFA {\n");
fprintf(f, "rankdir=LR;\n");
fprintf(f, "size=\"11.5,8\"\n");
@ -271,7 +260,6 @@ void dump_vars(const GoughGraph &g, const string &base, const Grey &grey) {
}
fprintf(f, "}\n");
fclose(f);
}
void dump(const GoughGraph &g, const string &base, const Grey &grey) {
@ -317,18 +305,11 @@ void dump_blocks(const map<gough_edge_id, vector<gough_ins>> &blocks,
return;
}
FILE *f;
{
stringstream ss;
ss << grey.dumpPath << "gough_" << base << "_programs.txt";
f = fopen(ss.str().c_str(), "w");
}
StdioFile f(grey.dumpPath + "gough_" + base + "_programs.txt", "w");
for (const auto &m : blocks) {
dump_block(f, m.first, m.second);
}
fclose(f);
}
} // namespace ue2

View File

@ -33,9 +33,9 @@
#include "mcclellancompile.h"
#include "ue2common.h"
#include "util/charreach.h"
#include "util/flat_containers.h"
#include "util/noncopyable.h"
#include "util/order_check.h"
#include "util/ue2_containers.h"
#include <map>
#include <memory>
@ -106,10 +106,10 @@ struct GoughSSAVarJoin;
struct GoughSSAVar : noncopyable {
GoughSSAVar(void) : seen(false), slot(INVALID_SLOT) {}
virtual ~GoughSSAVar();
const ue2::flat_set<GoughSSAVar *> &get_inputs() const {
const flat_set<GoughSSAVar *> &get_inputs() const {
return inputs;
}
const ue2::flat_set<GoughSSAVarWithInputs *> &get_outputs() const {
const flat_set<GoughSSAVarWithInputs *> &get_outputs() const {
return outputs;
}
virtual void replace_input(GoughSSAVar *old_v, GoughSSAVar *new_v) = 0;
@ -127,8 +127,8 @@ struct GoughSSAVar : noncopyable {
clear_outputs();
}
protected:
ue2::flat_set<GoughSSAVar *> inputs;
ue2::flat_set<GoughSSAVarWithInputs *> outputs;
flat_set<GoughSSAVar *> inputs;
flat_set<GoughSSAVarWithInputs *> outputs;
friend struct GoughSSAVarWithInputs;
friend struct GoughSSAVarMin;
friend struct GoughSSAVarJoin;
@ -184,16 +184,14 @@ struct GoughSSAVarJoin : public GoughSSAVarWithInputs {
void add_input(GoughSSAVar *v, GoughEdge prev);
const ue2::flat_set<GoughEdge> &get_edges_for_input(GoughSSAVar *input)
const;
const std::map<GoughSSAVar *, ue2::flat_set<GoughEdge> > &get_input_map()
const;
const flat_set<GoughEdge> &get_edges_for_input(GoughSSAVar *input) const;
const std::map<GoughSSAVar *, flat_set<GoughEdge>> &get_input_map() const;
protected:
void remove_input_raw(GoughSSAVar *v) override;
private:
std::map<GoughSSAVar *, ue2::flat_set<GoughEdge>> input_map;
std::map<GoughSSAVar *, flat_set<GoughEdge>> input_map;
};
struct gough_accel_state_info {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,10 +32,10 @@
#include "gough_internal.h"
#include "grey.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/ue2_containers.h"
#include "ue2common.h"
@ -235,7 +235,7 @@ void handle_pending_vertices(GoughSSAVar *def, const GoughGraph &g,
if (contains(aux.containing_v, def)) {
def_v = aux.containing_v.at(def);
}
ue2::unordered_set<GoughVertex> done;
unordered_set<GoughVertex> done;
while (!pending_vertex.empty()) {
GoughVertex current = *pending_vertex.begin();
pending_vertex.erase(current);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -353,22 +353,14 @@ void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
void nfaExecGough16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == GOUGH_NFA_16);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
nfaExecGough16_dumpText(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
nfaExecGough16_dumpDot(nfa, f);
fclose(f);
nfaExecGough16_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecGough16_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecGough8_dump(const NFA *nfa, const string &base) {
assert(nfa->type == GOUGH_NFA_8);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
nfaExecGough8_dumpText(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
nfaExecGough8_dumpDot(nfa, f);
fclose(f);
nfaExecGough8_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecGough8_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -71,47 +71,40 @@ void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
assert(nfa);
assert(nfa->type == LBR_NFA_DOT);
const lbr_dot *ld = (const lbr_dot *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
lbrDumpCommon(&ld->common, f);
fprintf(f, "DOT model\n");
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fclose(f);
}
void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
assert(nfa);
assert(nfa->type == LBR_NFA_VERM);
const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
lbrDumpCommon(&lv->common, f);
fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fclose(f);
}
void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) {
assert(nfa);
assert(nfa->type == LBR_NFA_NVERM);
const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
lbrDumpCommon(&lv->common, f);
fprintf(f, "NEGATED VERM model, scanning for 0x%02x\n", lv->c);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fclose(f);
}
void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) {
assert(nfa);
assert(nfa->type == LBR_NFA_SHUF);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa);
lbrDumpCommon(&ls->common, f);
@ -122,14 +115,13 @@ void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) {
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fclose(f);
}
void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) {
assert(nfa);
assert(nfa->type == LBR_NFA_TRUF);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa);
lbrDumpCommon(&lt->common, f);
@ -140,7 +132,6 @@ void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) {
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fclose(f);
}
} // namespace ue2

View File

@ -53,11 +53,13 @@
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
#include "util/order_check.h"
#include "util/unordered.h"
#include "util/verify_types.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <cassert>
@ -96,18 +98,20 @@ struct precalcAccel {
};
struct limex_accel_info {
ue2::unordered_set<NFAVertex> accelerable;
unordered_set<NFAVertex> accelerable;
map<NFAStateSet, precalcAccel> precalc;
ue2::unordered_map<NFAVertex, flat_set<NFAVertex>> friends;
ue2::unordered_map<NFAVertex, AccelScheme> accel_map;
unordered_map<NFAVertex, flat_set<NFAVertex>> friends;
unordered_map<NFAVertex, AccelScheme> accel_map;
};
static
map<NFAVertex, NFAStateSet>
reindexByStateId(const map<NFAVertex, NFAStateSet> &in, const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
unordered_map<NFAVertex, NFAStateSet>
reindexByStateId(const unordered_map<NFAVertex, NFAStateSet> &in,
const NGHolder &g,
const unordered_map<NFAVertex, u32> &state_ids,
const u32 num_states) {
map<NFAVertex, NFAStateSet> out;
unordered_map<NFAVertex, NFAStateSet> out;
out.reserve(in.size());
vector<u32> indexToState(num_vertices(g), NO_STATE);
for (const auto &m : state_ids) {
@ -137,18 +141,20 @@ reindexByStateId(const map<NFAVertex, NFAStateSet> &in, const NGHolder &g,
struct build_info {
build_info(NGHolder &hi,
const ue2::unordered_map<NFAVertex, u32> &states_in,
const unordered_map<NFAVertex, u32> &states_in,
const vector<BoundedRepeatData> &ri,
const map<NFAVertex, NFAStateSet> &rsmi,
const map<NFAVertex, NFAStateSet> &smi,
const unordered_map<NFAVertex, NFAStateSet> &rsmi,
const unordered_map<NFAVertex, NFAStateSet> &smi,
const map<u32, set<NFAVertex>> &ti, const set<NFAVertex> &zi,
bool dai, bool sci, const CompileContext &cci,
u32 nsi)
: h(hi), state_ids(states_in), repeats(ri), tops(ti), zombies(zi),
do_accel(dai), stateCompression(sci), cc(cci),
bool dai, bool sci, const CompileContext &cci, u32 nsi)
: h(hi), state_ids(states_in), repeats(ri), tops(ti), tugs(nsi),
zombies(zi), do_accel(dai), stateCompression(sci), cc(cci),
num_states(nsi) {
for (const auto &br : repeats) {
insert(&tugs, br.tug_triggers);
for (auto v : br.tug_triggers) {
assert(state_ids.at(v) != NO_STATE);
tugs.set(state_ids.at(v));
}
br_cyclic[br.cyclic] =
BoundedRepeatSummary(br.repeatMin, br.repeatMax);
}
@ -160,15 +166,15 @@ struct build_info {
}
NGHolder &h;
const ue2::unordered_map<NFAVertex, u32> &state_ids;
const unordered_map<NFAVertex, u32> &state_ids;
const vector<BoundedRepeatData> &repeats;
// Squash maps; state sets are indexed by state_id.
map<NFAVertex, NFAStateSet> reportSquashMap;
map<NFAVertex, NFAStateSet> squashMap;
unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
unordered_map<NFAVertex, NFAStateSet> squashMap;
const map<u32, set<NFAVertex>> &tops;
ue2::unordered_set<NFAVertex> tugs;
NFAStateSet tugs;
map<NFAVertex, BoundedRepeatSummary> br_cyclic;
const set<NFAVertex> &zombies;
bool do_accel;
@ -238,7 +244,7 @@ bool isLimitedTransition(int from, int to, int maxshift) {
// Fill a bit mask
template<class Mask>
void maskFill(Mask &m, char c) {
void maskFill(Mask &m, u8 c) {
memset(&m, c, sizeof(m));
}
@ -478,7 +484,7 @@ bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
static
void nfaFindAccelSchemes(const NGHolder &g,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
ue2::unordered_map<NFAVertex, AccelScheme> *out) {
unordered_map<NFAVertex, AccelScheme> *out) {
vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);
NFAVertex sds_or_proxy = get_sds_or_proxy(g);
@ -503,8 +509,8 @@ void nfaFindAccelSchemes(const NGHolder &g,
}
struct fas_visitor : public boost::default_bfs_visitor {
fas_visitor(const ue2::unordered_map<NFAVertex, AccelScheme> &am_in,
ue2::unordered_map<NFAVertex, AccelScheme> *out_in)
fas_visitor(const unordered_map<NFAVertex, AccelScheme> &am_in,
unordered_map<NFAVertex, AccelScheme> *out_in)
: accel_map(am_in), out(out_in) {}
void discover_vertex(NFAVertex v, const NGHolder &) {
@ -515,13 +521,13 @@ struct fas_visitor : public boost::default_bfs_visitor {
throw this; /* done */
}
}
const ue2::unordered_map<NFAVertex, AccelScheme> &accel_map;
ue2::unordered_map<NFAVertex, AccelScheme> *out;
const unordered_map<NFAVertex, AccelScheme> &accel_map;
unordered_map<NFAVertex, AccelScheme> *out;
};
static
void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
ue2::unordered_map<NFAVertex, AccelScheme> *accel_map) {
unordered_map<NFAVertex, AccelScheme> *accel_map) {
/* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything
* else should be ditched. We use a simple BFS to choose accel states near
* the start. */
@ -541,14 +547,12 @@ void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
tempEdges.push_back(e); // Remove edge later.
}
ue2::unordered_map<NFAVertex, AccelScheme> out;
unordered_map<NFAVertex, AccelScheme> out;
try {
vector<boost::default_color_type> colour(num_vertices(g));
boost::breadth_first_search(g, g.start,
visitor(fas_visitor(*accel_map, &out))
.color_map(make_iterator_property_map(colour.begin(),
get(vertex_index, g))));
visitor(fas_visitor(*accel_map, &out))
.color_map(make_small_color_map(g)));
} catch (fas_visitor *) {
; /* found max accel_states */
}
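
The colour vector plus make_iterator_property_map pair gives way to make_small_color_map here; judging by its use, it is a property map that packs the BGL search colours into two bits per vertex rather than a byte-or-wider default_color_type. A standalone sketch of such packing, under that assumption (names hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

enum class SmallColor : std::uint8_t { white = 0, gray = 1, black = 2 };

struct PackedColorMap {
    std::vector<std::uint8_t> data; // four 2-bit colours per byte
    explicit PackedColorMap(std::size_t n) : data((n + 3) / 4, 0) {}
    SmallColor get(std::size_t i) const {
        return SmallColor((data[i / 4] >> ((i % 4) * 2)) & 0x3);
    }
    void set(std::size_t i, SmallColor c) {
        const unsigned shift = unsigned(i % 4) * 2;
        data[i / 4] = std::uint8_t((data[i / 4] & ~(0x3u << shift)) |
                                   (std::uint8_t(c) << shift));
    }
};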
@ -983,16 +987,18 @@ u32 addSquashMask(const build_info &args, const NFAVertex &v,
return idx;
}
using ReportListCache = ue2_unordered_map<vector<ReportID>, u32>;
static
u32 addReports(const flat_set<ReportID> &r, vector<ReportID> &reports,
unordered_map<vector<ReportID>, u32> &reportListCache) {
ReportListCache &reports_cache) {
assert(!r.empty());
vector<ReportID> my_reports(begin(r), end(r));
my_reports.push_back(MO_INVALID_IDX); // sentinel
auto cache_it = reportListCache.find(my_reports);
if (cache_it != end(reportListCache)) {
auto cache_it = reports_cache.find(my_reports);
if (cache_it != end(reports_cache)) {
u32 offset = cache_it->second;
DEBUG_PRINTF("reusing cached report list at %u\n", offset);
return offset;
@ -1008,13 +1014,12 @@ u32 addReports(const flat_set<ReportID> &r, vector<ReportID> &reports,
u32 offset = verify_u32(reports.size());
insert(&reports, reports.end(), my_reports);
reportListCache.emplace(move(my_reports), offset);
reports_cache.emplace(move(my_reports), offset);
return offset;
}
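
addReports caches each report list (plus its MO_INVALID_IDX sentinel) keyed by contents, so identical lists written to the global vector are shared by offset. The caching shape, reduced to standard containers (names illustrative):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using ReportID = std::uint32_t;
static const ReportID SENTINEL = 0xffffffffu; // stands in for MO_INVALID_IDX

std::uint32_t add_reports(const std::vector<ReportID> &r,
                          std::vector<ReportID> &global,
                          std::map<std::vector<ReportID>, std::uint32_t> &cache) {
    std::vector<ReportID> key(r);
    key.push_back(SENTINEL); // terminator is stored as part of the list
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second; // identical list already written: reuse its offset
    }
    const std::uint32_t offset = static_cast<std::uint32_t>(global.size());
    global.insert(global.end(), key.begin(), key.end());
    cache.emplace(std::move(key), offset);
    return offset;
}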
static
void buildAcceptsList(const build_info &args,
unordered_map<vector<ReportID>, u32> &reports_cache,
void buildAcceptsList(const build_info &args, ReportListCache &reports_cache,
vector<NFAVertex> &verts, vector<NFAAccept> &accepts,
vector<ReportID> &reports, vector<NFAStateSet> &squash) {
if (verts.empty()) {
@ -1052,8 +1057,7 @@ void buildAcceptsList(const build_info &args,
}
static
void buildAccepts(const build_info &args,
unordered_map<vector<ReportID>, u32> &reports_cache,
void buildAccepts(const build_info &args, ReportListCache &reports_cache,
NFAStateSet &acceptMask, NFAStateSet &acceptEodMask,
vector<NFAAccept> &accepts, vector<NFAAccept> &acceptsEod,
vector<ReportID> &reports, vector<NFAStateSet> &squash) {
@ -1120,7 +1124,7 @@ u32 uncompressedStateSize(u32 num_states) {
static
u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates,
const ue2::unordered_map<NFAVertex, u32> &state_ids) {
const unordered_map<NFAVertex, u32> &state_ids) {
// Shrink state requirement to enough to fit the compressed largest reach.
vector<u32> allreach(N_CHARS, 0);
@ -1191,7 +1195,7 @@ bool hasSquashableInitDs(const build_info &args) {
static
bool hasInitDsStates(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &state_ids) {
const unordered_map<NFAVertex, u32> &state_ids) {
if (state_ids.at(h.startDs) != NO_STATE) {
return true;
}
@ -1359,17 +1363,16 @@ struct ExceptionProto {
};
static
u32 buildExceptionMap(const build_info &args,
unordered_map<vector<ReportID>, u32> &reports_cache,
const ue2::unordered_set<NFAEdge> &exceptional,
u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
const unordered_set<NFAEdge> &exceptional,
map<ExceptionProto, vector<u32>> &exceptionMap,
vector<ReportID> &reportList) {
const NGHolder &h = args.h;
const u32 num_states = args.num_states;
u32 exceptionCount = 0;
ue2::unordered_map<NFAVertex, u32> pos_trigger;
ue2::unordered_map<NFAVertex, u32> tug_trigger;
unordered_map<NFAVertex, u32> pos_trigger;
unordered_map<NFAVertex, u32> tug_trigger;
for (u32 i = 0; i < args.repeats.size(); i++) {
const BoundedRepeatData &br = args.repeats[i];
@ -1518,18 +1521,14 @@ u32 depth_to_u32(const depth &d) {
}
static
bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e,
const build_info &args, u32 maxShift) {
NFAVertex from = source(e, h);
NFAVertex to = target(e, h);
u32 f = args.state_ids.at(from);
u32 t = args.state_ids.at(to);
if (!isLimitedTransition(f, t, maxShift)) {
bool isExceptionalTransition(u32 from, u32 to, const build_info &args,
u32 maxShift) {
if (!isLimitedTransition(from, to, maxShift)) {
return true;
}
// All transitions out of a tug trigger are exceptional.
if (contains(args.tugs, from)) {
if (args.tugs.test(from)) {
return true;
}
return false;
@ -1545,7 +1544,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
if (from == NO_STATE || to == NO_STATE) {
continue;
}
if (!isExceptionalTransition(h, e, args, MAX_SHIFT_AMOUNT)) {
if (!isExceptionalTransition(from, to, args, MAX_SHIFT_AMOUNT)) {
shiftMask |= (1UL << (to - from));
}
}
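
In the LimEx model a non-exceptional edge moves a state bit left by (to - from), and this loop records which shift distances actually occur. A loose standalone sketch of the mask accumulation, with hypothetical types:

#include <cstdint>
#include <utility>
#include <vector>

// assumes max_shift < 32 so the mask fits in a u32
std::uint32_t
shift_mask(const std::vector<std::pair<std::uint32_t, std::uint32_t>> &edges,
           std::uint32_t max_shift) {
    std::uint32_t mask = 0;
    for (const auto &e : edges) {
        const std::uint32_t from = e.first, to = e.second;
        if (to >= from && to - from <= max_shift) {
            mask |= 1u << (to - from); // limited: a shift by (to - from)
        }
    }
    return mask;
}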
@ -1574,7 +1573,7 @@ int getLimexScore(const build_info &args, u32 nShifts) {
if (from == NO_STATE || to == NO_STATE) {
continue;
}
if (isExceptionalTransition(h, e, args, maxVarShift)) {
if (isExceptionalTransition(from, to, args, maxVarShift)) {
exceptionalStates.set(from);
}
}
@ -1615,9 +1614,7 @@ bool cannotDie(const build_info &args, const set<NFAVertex> &tops) {
// top, looking for a cyclic path consisting of vertices of dot reach. If
// one exists, then the NFA cannot die after this top is triggered.
vector<boost::default_color_type> colours(num_vertices(h));
auto colour_map = boost::make_iterator_property_map(colours.begin(),
get(vertex_index, h));
auto colour_map = make_small_color_map(h);
struct CycleFound {};
struct CannotDieVisitor : public boost::default_dfs_visitor {
@ -1848,10 +1845,9 @@ struct Factory {
maskSetBit(limex->repeatCyclicMask, cyclic);
}
/* also include tugs in repeat cyclic mask */
for (NFAVertex v : args.tugs) {
u32 v_state = args.state_ids.at(v);
assert(v_state != NO_STATE);
maskSetBit(limex->repeatCyclicMask, v_state);
for (size_t i = args.tugs.find_first(); i != args.tugs.npos;
i = args.tugs.find_next(i)) {
maskSetBit(limex->repeatCyclicMask, i);
}
}
@ -1872,7 +1868,7 @@ struct Factory {
// We check for exceptional transitions here, as we don't want tug
// trigger transitions emitted as limited transitions (even if they
// could be in this model).
if (!isExceptionalTransition(h, e, args, maxShift)) {
if (!isExceptionalTransition(from, to, args, maxShift)) {
u32 shift = to - from;
if ((shiftMask & (1UL << shift)) == 0UL) {
shiftMask |= (1UL << shift);
@ -1896,7 +1892,7 @@ struct Factory {
static
void findExceptionalTransitions(const build_info &args,
ue2::unordered_set<NFAEdge> &exceptional,
unordered_set<NFAEdge> &exceptional,
u32 maxShift) {
const NGHolder &h = args.h;
@ -1907,7 +1903,7 @@ struct Factory {
continue;
}
if (isExceptionalTransition(h, e, args, maxShift)) {
if (isExceptionalTransition(from, to, args, maxShift)) {
exceptional.insert(e);
}
}
@ -2171,9 +2167,9 @@ struct Factory {
// We track report lists that have already been written into the global
// list in case we can reuse them.
unordered_map<vector<ReportID>, u32> reports_cache;
ReportListCache reports_cache;
ue2::unordered_set<NFAEdge> exceptional;
unordered_set<NFAEdge> exceptional;
u32 shiftCount = findBestNumOfVarShifts(args);
assert(shiftCount);
u32 maxShift = findMaxVarShift(args, shiftCount);
@ -2377,10 +2373,10 @@ MAKE_LIMEX_TRAITS(512)
// Some sanity tests, called by an assertion in generate().
static UNUSED
bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
const unordered_map<NFAVertex, u32> &state_ids,
u32 num_states) {
ue2::unordered_set<u32> seen;
ue2::unordered_set<NFAVertex> top_starts;
unordered_set<u32> seen;
unordered_set<NFAVertex> top_starts;
for (const auto &vv : tops | map_values) {
insert(&top_starts, vv);
}
@ -2427,7 +2423,7 @@ bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
#endif // NDEBUG
static
u32 max_state(const ue2::unordered_map<NFAVertex, u32> &state_ids) {
u32 max_state(const unordered_map<NFAVertex, u32> &state_ids) {
u32 rv = 0;
for (const auto &m : state_ids) {
DEBUG_PRINTF("state %u\n", m.second);
@ -2440,14 +2436,14 @@ u32 max_state(const ue2::unordered_map<NFAVertex, u32> &state_ids) {
}
bytecode_ptr<NFA> generate(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const map<NFAVertex, NFAStateSet> &reportSquashMap,
const map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies, bool do_accel,
bool stateCompression, u32 hint,
const CompileContext &cc) {
const unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
const unordered_map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies, bool do_accel,
bool stateCompression, u32 hint,
const CompileContext &cc) {
const u32 num_states = max_state(states) + 1;
DEBUG_PRINTF("total states: %u\n", num_states);
@ -2510,13 +2506,13 @@ bytecode_ptr<NFA> generate(NGHolder &h,
}
u32 countAccelStates(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const map<NFAVertex, NFAStateSet> &reportSquashMap,
const map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies,
const CompileContext &cc) {
const unordered_map<NFAVertex, u32> &states,
const vector<BoundedRepeatData> &repeats,
const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
const unordered_map<NFAVertex, NFAStateSet> &squashMap,
const map<u32, set<NFAVertex>> &tops,
const set<NFAVertex> &zombies,
const CompileContext &cc) {
const u32 num_states = max_state(states) + 1;
DEBUG_PRINTF("total states: %u\n", num_states);

View File

@ -34,15 +34,16 @@
#ifndef LIMEX_COMPILE_H
#define LIMEX_COMPILE_H
#include <map>
#include <memory>
#include <vector>
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_squash.h" // for NFAStateSet
#include "ue2common.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
#include <set>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
struct NFA;
@ -69,16 +70,16 @@ struct CompileContext;
* graph.
*/
bytecode_ptr<NFA> generate(NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &states,
const std::vector<BoundedRepeatData> &repeats,
const std::map<NFAVertex, NFAStateSet> &reportSquashMap,
const std::map<NFAVertex, NFAStateSet> &squashMap,
const std::map<u32, std::set<NFAVertex>> &tops,
const std::set<NFAVertex> &zombies,
bool do_accel,
bool stateCompression,
u32 hint,
const CompileContext &cc);
const std::unordered_map<NFAVertex, u32> &states,
const std::vector<BoundedRepeatData> &repeats,
const std::unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
const std::unordered_map<NFAVertex, NFAStateSet> &squashMap,
const std::map<u32, std::set<NFAVertex>> &tops,
const std::set<NFAVertex> &zombies,
bool do_accel,
bool stateCompression,
u32 hint,
const CompileContext &cc);
/**
* \brief For a given graph, count the number of accelerable states it has.
@ -87,13 +88,13 @@ bytecode_ptr<NFA> generate(NGHolder &g,
* implementable.
*/
u32 countAccelStates(NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &states,
const std::vector<BoundedRepeatData> &repeats,
const std::map<NFAVertex, NFAStateSet> &reportSquashMap,
const std::map<NFAVertex, NFAStateSet> &squashMap,
const std::map<u32, std::set<NFAVertex>> &tops,
const std::set<NFAVertex> &zombies,
const CompileContext &cc);
const std::unordered_map<NFAVertex, u32> &states,
const std::vector<BoundedRepeatData> &repeats,
const std::unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
const std::unordered_map<NFAVertex, NFAStateSet> &squashMap,
const std::map<u32, std::set<NFAVertex>> &tops,
const std::set<NFAVertex> &zombies,
const CompileContext &cc);
} // namespace ue2

View File

@ -487,25 +487,24 @@ void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) {
}
}
template<typename limex_type>
static
void dumpLimexDot(const NFA *nfa, const limex_type *limex, FILE *f) {
dumpDotPreamble(f);
u32 state_count = nfa->nPositions;
dumpVertexDotInfo(limex, state_count, f, limex_labeller<limex_type>(limex));
for (u32 i = 0; i < state_count; i++) {
dumpLimDotInfo(limex, i, f);
dumpExDotInfo(limex, i, f);
}
dumpDotTrailer(f);
}
#define LIMEX_DUMP_FN(size) \
void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) { \
auto limex = (const LimExNFA##size *)getImplNfa(nfa); \
\
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w"); \
dumpLimexText(limex, f); \
fclose(f); \
\
f = fopen_or_throw((base + ".dot").c_str(), "w"); \
dumpDotPreamble(f); \
u32 state_count = nfa->nPositions; \
dumpVertexDotInfo(limex, state_count, f, \
limex_labeller<LimExNFA##size>(limex)); \
for (u32 i = 0; i < state_count; i++) { \
dumpLimDotInfo(limex, i, f); \
dumpExDotInfo(limex, i, f); \
} \
dumpDotTrailer(f); \
fclose(f); \
dumpLimexText(limex, StdioFile(base + ".txt", "w")); \
dumpLimexDot(nfa, limex, StdioFile(base + ".dot", "w")); \
}
LIMEX_DUMP_FN(32)
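
The dump macro shrinks because the fopen_or_throw/fclose pairs are replaced by StdioFile temporaries passed straight to the dumpers. Presumably StdioFile (util/dump_util.h) is an RAII wrapper convertible to FILE*; a minimal sketch under that assumption:

#include <cstdio>
#include <stdexcept>
#include <string>

class StdioFileSketch {
    FILE *f;
public:
    StdioFileSketch(const std::string &name, const char *mode)
        : f(std::fopen(name.c_str(), mode)) {
        if (!f) {
            throw std::runtime_error("failed to open " + name);
        }
    }
    ~StdioFileSketch() { std::fclose(f); }
    StdioFileSketch(const StdioFileSketch &) = delete;
    StdioFileSketch &operator=(const StdioFileSketch &) = delete;
    operator FILE *() const { return f; } // usable wherever a FILE* is expected
};

With that shape, dumpLimexText(limex, StdioFile(base + ".txt", "w")) keeps the handle open for the duration of the call and closes it when the temporary is destroyed at the end of the statement.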

View File

@ -46,7 +46,7 @@
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include "util/unaligned.h"
#include "util/verify_types.h"
@ -288,11 +288,12 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
raw_report_list rrl(s.reports, rm, remap_reports);
DEBUG_PRINTF("non empty r\n");
if (rev.find(rrl) != rev.end()) {
reports.push_back(rev[rrl]);
auto it = rev.find(rrl);
if (it != rev.end()) {
reports.push_back(it->second);
} else {
DEBUG_PRINTF("adding to rl %zu\n", ri->size());
rev[rrl] = ri->size();
rev.emplace(rrl, ri->size());
reports.push_back(ri->size());
ri->rl.push_back(rrl);
}
@ -306,13 +307,14 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
DEBUG_PRINTF("non empty r eod\n");
raw_report_list rrl(s.reports_eod, rm, remap_reports);
if (rev.find(rrl) != rev.end()) {
reports_eod.push_back(rev[rrl]);
auto it = rev.find(rrl);
if (it != rev.end()) {
reports_eod.push_back(it->second);
continue;
}
DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size());
rev[rrl] = ri->size();
rev.emplace(rrl, ri->size());
reports_eod.push_back(ri->size());
ri->rl.push_back(rrl);
}
@ -325,10 +327,9 @@ unique_ptr<raw_report_info> mcclellan_build_strat::gatherReports(
*arbReport = 0;
}
/* if we have only a single report id generated from all accepts (not eod)
* we can take some short cuts */
set<ReportID> reps;
flat_set<ReportID> reps;
for (u32 rl_index : reports) {
if (rl_index == MO_INVALID_IDX) {
@ -897,7 +898,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
}
u32 self_loop_width = 0;
const dstate curr_raw = info.states[curr_id];
const dstate &curr_raw = info.states[curr_id];
for (unsigned i = 0; i < N_CHARS; i++) {
if (curr_raw.next[info.alpha_remap[i]] == curr_id) {
self_loop_width++;
@ -914,33 +915,6 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
info.extra[curr_id].shermanState = true;
}
/*
* Calls accessible outside this module.
*/
u16 raw_dfa::getImplAlphaSize() const {
return alpha_size - N_SPECIAL_SYMBOL;
}
void raw_dfa::stripExtraEodReports(void) {
/* if a state generates a given report as a normal accept - then it does
* not also need to generate an eod report for it */
for (dstate &ds : states) {
for (const ReportID &report : ds.reports) {
ds.reports_eod.erase(report);
}
}
}
bool raw_dfa::hasEodReports(void) const {
for (const dstate &ds : states) {
if (!ds.reports_eod.empty()) {
return true;
}
}
return false;
}
static
bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
symbol_t alphasize = raw.getImplAlphaSize();
@ -964,7 +938,8 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
const CompileContext &cc,
bool trust_daddy_states,
set<dstate_id_t> *accel_states) {
u16 total_daddy = 0;
assert(!is_dead(raw));
dfa_info info(strat);
bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
@ -974,21 +949,24 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
}
bool has_eod_reports = raw.hasEodReports();
bool any_cyclic_near_anchored_state = is_cyclic_near(raw,
raw.start_anchored);
for (u32 i = 0; i < info.size(); i++) {
find_better_daddy(info, i, using8bit, any_cyclic_near_anchored_state,
trust_daddy_states, cc.grey);
total_daddy += info.extra[i].daddytaken;
}
DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
info.size() * info.impl_alpha_size, info.size(),
info.impl_alpha_size);
bytecode_ptr<NFA> nfa;
if (!using8bit) {
u16 total_daddy = 0;
bool any_cyclic_near_anchored_state
= is_cyclic_near(raw, raw.start_anchored);
for (u32 i = 0; i < info.size(); i++) {
find_better_daddy(info, i, using8bit,
any_cyclic_near_anchored_state,
trust_daddy_states, cc.grey);
total_daddy += info.extra[i].daddytaken;
}
DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
info.size() * info.impl_alpha_size, info.size(),
info.impl_alpha_size);
nfa = mcclellanCompile16(info, cc, accel_states);
} else {
nfa = mcclellanCompile8(info, cc, accel_states);

View File

@ -33,7 +33,6 @@
#include "rdfa.h"
#include "ue2common.h"
#include "util/bytecode_ptr.h"
#include "util/ue2_containers.h"
#include <memory>
#include <vector>

View File

@ -30,12 +30,11 @@
#include "rdfa.h"
#include "util/container.h"
#include "util/ue2_containers.h"
#include "util/hash.h"
#include "ue2common.h"
#include <deque>
#include <boost/functional/hash/hash.hpp>
#include <map>
using namespace std;
@ -127,13 +126,11 @@ u32 remove_leading_dots(raw_dfa &raw) {
static never_inline
u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
vector<u32> &dist = *dist_in;
dist.clear();
dist.resize(raw.states.size(), ~0U);
dist.assign(raw.states.size(), ~0U);
assert(raw.start_anchored != DEAD_STATE);
deque<dstate_id_t> to_visit;
to_visit.push_back(raw.start_anchored);
deque<dstate_id_t> to_visit = { raw.start_anchored };
dist[raw.start_anchored] = 0;
u32 last_d = 0;
@ -148,8 +145,7 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
assert(d >= last_d);
assert(d != ~0U);
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t t = raw.states[s].next[j];
for (dstate_id_t t : raw.states[s].next) {
if (t == DEAD_STATE) {
continue;
}
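
calc_min_dist_from_bob is a plain breadth-first walk: dist is reset in one step with assign(), the work queue is seeded with the anchored start at distance zero, and the new range-for visits each next[] entry directly. The same shape, standalone:

#include <cstdint>
#include <deque>
#include <vector>

std::vector<std::uint32_t>
bfs_dist(const std::vector<std::vector<std::uint32_t>> &next,
         std::uint32_t start) {
    std::vector<std::uint32_t> dist(next.size(), ~0u); // ~0u == unreachable
    std::deque<std::uint32_t> q = {start};
    dist[start] = 0;
    while (!q.empty()) {
        const std::uint32_t s = q.front();
        q.pop_front();
        for (std::uint32_t t : next[s]) { // mirrors `for (dstate_id_t t : ...)`
            if (dist[t] == ~0u) {
                dist[t] = dist[s] + 1;
                q.push_back(t);
            }
        }
    }
    return dist;
}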
@ -187,7 +183,21 @@ bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
}
}
return changed;
if (!changed) {
return false;
}
// We may have cleared all reports from the DFA, in which case it should
// become empty.
if (all_of_in(raw.states, [](const dstate &ds) {
return ds.reports.empty() && ds.reports_eod.empty();
})) {
DEBUG_PRINTF("no reports left at all, dfa is dead\n");
raw.start_anchored = DEAD_STATE;
raw.start_floating = DEAD_STATE;
}
return true;
}
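
The new tail of clear_deeper_reports detects when pruning has removed every report; in that case the DFA can never match, so both start pointers are set to DEAD_STATE and the is_dead() predicate added below holds. The emptiness test is an all_of over states, roughly:

#include <algorithm>
#include <set>
#include <vector>

struct DState {
    std::set<unsigned> reports, reports_eod;
};

// mirrors the all_of_in() predicate used above
bool no_reports_left(const std::vector<DState> &states) {
    return std::all_of(states.begin(), states.end(), [](const DState &ds) {
        return ds.reports.empty() && ds.reports_eod.empty();
    });
}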
set<ReportID> all_reports(const raw_dfa &rdfa) {
@ -218,22 +228,18 @@ bool has_non_eod_accepts(const raw_dfa &rdfa) {
}
size_t hash_dfa_no_reports(const raw_dfa &rdfa) {
using boost::hash_combine;
using boost::hash_range;
size_t v = 0;
hash_combine(v, rdfa.alpha_size);
hash_combine(v, hash_range(begin(rdfa.alpha_remap), end(rdfa.alpha_remap)));
hash_combine(v, rdfa.alpha_remap);
for (const auto &ds : rdfa.states) {
hash_combine(v, hash_range(begin(ds.next), end(ds.next)));
hash_combine(v, ds.next);
}
return v;
}
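
hash_dfa_no_reports now hands whole containers to hash_combine, relying on util/hash.h overloads to iterate, where the old code spelled out hash_range by hand. For one container, the boost equivalent is:

#include <boost/functional/hash.hpp>
#include <cstddef>
#include <vector>

std::size_t hash_vec(std::size_t seed, const std::vector<int> &v) {
    boost::hash_combine(seed, boost::hash_range(v.begin(), v.end()));
    return seed;
}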
size_t hash_dfa(const raw_dfa &rdfa) {
using boost::hash_combine;
size_t v = 0;
hash_combine(v, hash_dfa_no_reports(rdfa));
hash_combine(v, all_reports(rdfa));
@ -272,4 +278,9 @@ bool can_die_early(const raw_dfa &raw, u32 age_limit) {
return can_die_early(raw, raw.start_anchored, visited, age_limit);
}
bool is_dead(const raw_dfa &rdfa) {
return rdfa.start_anchored == DEAD_STATE &&
rdfa.start_floating == DEAD_STATE;
}
} // namespace ue2

View File

@ -59,6 +59,13 @@ size_t hash_dfa(const raw_dfa &rdfa);
bool can_die_early(const raw_dfa &raw, u32 age_limit);
/**
* \brief Returns true if this DFA cannot match, i.e. its start state is
* DEAD_STATE.
*/
bool is_dead(const raw_dfa &rdfa);
} // namespace ue2
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -442,22 +442,14 @@ void nfaExecMcClellan8_dumpText(const NFA *nfa, FILE *f) {
void nfaExecMcClellan16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCCLELLAN_NFA_16);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
nfaExecMcClellan16_dumpText(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
nfaExecMcClellan16_dumpDot(nfa, f);
fclose(f);
nfaExecMcClellan16_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecMcClellan16_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecMcClellan8_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCCLELLAN_NFA_8);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
nfaExecMcClellan8_dumpText(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
nfaExecMcClellan8_dumpDot(nfa, f);
fclose(f);
nfaExecMcClellan8_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecMcClellan8_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2

View File

@ -45,13 +45,14 @@
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/unaligned.h"
#include "util/unordered.h"
#include "util/verify_types.h"
#include <algorithm>
@ -383,6 +384,8 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) {
#define MAX_SHENG_STATES 16
#define MAX_SHENG_LEAKINESS 0.05
using LeakinessCache = ue2_unordered_map<pair<RdfaVertex, u32>, double>;
/**
* Returns the proportion of strings of length 'depth' which will leave the
* sheng region when starting at state 'u'.
@ -390,8 +393,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) {
static
double leakiness(const RdfaGraph &g, dfa_info &info,
const flat_set<RdfaVertex> &sheng_states, RdfaVertex u,
u32 depth,
unordered_map<pair<RdfaVertex, u32>, double> &cache) {
u32 depth, LeakinessCache &cache) {
double rv = 0;
if (contains(cache, make_pair(u, depth))) {
return cache[make_pair(u, depth)];
@ -426,7 +428,7 @@ double leakiness(const RdfaGraph &g, dfa_info &info,
static
double leakiness(const RdfaGraph &g, dfa_info &info,
const flat_set<RdfaVertex> &sheng_states, RdfaVertex u) {
unordered_map<pair<RdfaVertex, u32>, double> cache;
LeakinessCache cache;
double rv = leakiness(g, info, sheng_states, u, 8, cache);
return rv;
}
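
leakiness() is a depth-bounded recursion memoised on the (state, remaining depth) pair, which the LeakinessCache alias above now names explicitly. The caching skeleton with standard containers, recursion body elided:

#include <map>
#include <utility>

double leak(int u, unsigned depth,
            std::map<std::pair<int, unsigned>, double> &cache) {
    const auto key = std::make_pair(u, depth);
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second; // already computed for this (state, depth)
    }
    double rv = 0.0;
    if (depth > 0) {
        // ... sum leak(v, depth - 1, cache) over successors v of u,
        //     weighted by how many symbols lead from u to v ...
    }
    cache.emplace(key, rv);
    return rv;
}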
@ -738,7 +740,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
assert(info.is_normal(currState.daddy));
u32 self_loop_width = 0;
const dstate curr_raw = info.states[curr_id];
const dstate &curr_raw = info.states[curr_id];
for (unsigned i = 0; i < N_CHARS; i++) {
if (curr_raw.next[info.alpha_remap[i]] == curr_id) {
self_loop_width++;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -394,22 +394,14 @@ void dump_text_8(const NFA *nfa, FILE *f) {
void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_NFA_16);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
dump_text_16(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
dump_dot_16(nfa, f);
fclose(f);
dump_text_16(nfa, StdioFile(base + ".txt", "w"));
dump_dot_16(nfa, StdioFile(base + ".dot", "w"));
}
void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_NFA_8);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
dump_text_8(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
dump_dot_8(nfa, f);
fclose(f);
dump_text_8(nfa, StdioFile(base + ".txt", "w"));
dump_dot_8(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -132,7 +132,7 @@ void dumpCounter(FILE *f, const mpv_counter_info *c) {
void nfaExecMpv_dump(const NFA *nfa, const string &base) {
const mpv *m = (const mpv *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
fprintf(f, "Puff the Magic Engines\n");
fprintf(f, "\n");
@ -154,7 +154,6 @@ void nfaExecMpv_dump(const NFA *nfa, const string &base) {
}
dumpTextReverse(nfa, f);
fclose(f);
}
} // namespace ue2

src/nfa/rdfa.cpp Normal file
View File

@ -0,0 +1,55 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "rdfa.h"
namespace ue2 {
// prevent weak vtables
raw_dfa::~raw_dfa() {}
void raw_dfa::stripExtraEodReports(void) {
/* if a state generates a given report as a normal accept - then it does
* not also need to generate an eod report for it */
for (dstate &ds : states) {
for (const ReportID &report : ds.reports) {
ds.reports_eod.erase(report);
}
}
}
bool raw_dfa::hasEodReports(void) const {
for (const dstate &ds : states) {
if (!ds.reports_eod.empty()) {
return true;
}
}
return false;
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,7 +32,7 @@
#include "nfa_kind.h"
#include "ue2common.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <array>
#include <vector>
@ -81,7 +81,7 @@ struct raw_dfa {
explicit raw_dfa(nfa_kind k) : kind(k) {}
virtual ~raw_dfa();
u16 getImplAlphaSize() const;
u16 getImplAlphaSize() const { return alpha_size - N_SPECIAL_SYMBOL; }
virtual void stripExtraEodReports(void);
bool hasEodReports(void) const;
};

View File

@ -36,9 +36,10 @@
#include "nfagraph/ng_mcclellan_internal.h"
#include "util/container.h"
#include "util/determinise.h"
#include "util/flat_containers.h"
#include "util/make_unique.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/unordered.h"
#include <algorithm>
#include <queue>
@ -53,8 +54,8 @@ namespace {
class Automaton_Merge {
public:
typedef vector<u16> StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
using StateSet = vector<u16>;
using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
Automaton_Merge(const raw_dfa *rdfa1, const raw_dfa *rdfa2,
const ReportManager *rm_in, const Grey &grey_in)
@ -289,7 +290,7 @@ unique_ptr<raw_dfa> mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2,
auto rdfa = ue2::make_unique<raw_dfa>(d1->kind);
Automaton_Merge autom(d1, d2, rm, grey);
if (!determinise(autom, rdfa->states, max_states)) {
if (determinise(autom, rdfa->states, max_states)) {
rdfa->start_anchored = autom.start_anchored;
rdfa->start_floating = autom.start_floating;
rdfa->alpha_size = autom.alphasize;
@ -374,7 +375,7 @@ unique_ptr<raw_dfa> mergeAllDfas(const vector<const raw_dfa *> &dfas,
DEBUG_PRINTF("merging dfa\n");
if (determinise(n, rdfa->states, max_states)) {
if (!determinise(n, rdfa->states, max_states)) {
DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states);
return nullptr; /* over state limit */
}
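
Note both determinise() call sites in this file flip their sense; together with the ng_haig changes further down, this suggests determinise() now returns true on success (the automaton stayed within max_states) rather than a nonzero failure code, so the guards read:

    if (!determinise(n, rdfa->states, max_states)) {
        return nullptr; /* over state limit */
    }
    /* on success, rdfa->states has been populated */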

View File

@ -33,7 +33,10 @@
#include "rdfa.h"
#include "util/bytecode_ptr.h"
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <memory>
#include <set>
struct NFA;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,7 +41,6 @@
#include "util/dump_util.h"
#include "util/simd_types.h"
#ifndef DUMP_SUPPORT
#error No dump support!
#endif
@ -267,12 +266,8 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) {
void nfaExecSheng_dump(const NFA *nfa, const string &base) {
assert(nfa->type == SHENG_NFA);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
nfaExecSheng_dumpText(nfa, f);
fclose(f);
f = fopen_or_throw((base + ".dot").c_str(), "w");
nfaExecSheng_dumpDot(nfa, f);
fclose(f);
nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w"));
nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w"));
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -33,7 +33,7 @@
#include "ue2common.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <array>
#include <cassert>

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,7 +35,7 @@
#include "ue2common.h"
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <utility>

View File

@ -27,7 +27,7 @@
*/
/** \file
* \brief Tamarama: container engine for exclusve engines, dump code.
* \brief Tamarama: container engine for exclusive engines, dump code.
*/
#include "config.h"
@ -54,7 +54,7 @@ namespace ue2 {
void nfaExecTamarama_dump(const struct NFA *nfa, const string &base) {
const Tamarama *t = (const Tamarama *)getImplNfa(nfa);
FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
StdioFile f(base + ".txt", "w");
fprintf(f, "Tamarama container engine\n");
fprintf(f, "\n");
@ -63,7 +63,6 @@ void nfaExecTamarama_dump(const struct NFA *nfa, const string &base) {
fprintf(f, "\n");
dumpTextReverse(nfa, f);
fprintf(f, "\n");
fclose(f);
const u32 *subOffset =
(const u32 *)((const char *)t + sizeof(struct Tamarama) +

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -32,12 +32,15 @@
* truffle is always able to represent an entire character class, providing a
* backstop to other acceleration engines.
*/
#include "trufflecompile.h"
#include "ue2common.h"
#include "util/charreach.h"
#include "util/dump_mask.h"
#include "util/simd_types.h"
#include "util/dump_mask.h"
#include <cstring>
using namespace std;

View File

@ -44,7 +44,6 @@
#include "util/graph.h"
#include "util/noncopyable.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include <deque>
#include <map>

View File

@ -220,6 +220,52 @@ vector<NFAEdge> findShellEdges(const NGHolder &g,
return shell_edges;
}
template<typename GetAdjRange>
bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &shell,
GetAdjRange adj_range_func) {
if (shell.empty()) {
DEBUG_PRINTF("no shell\n");
return false;
}
NFAVertex exit_vertex = NGHolder::null_vertex();
for (auto u : shell) {
for (auto v : adj_range_func(u, g)) {
if (contains(shell, v)) {
continue;
}
if (!exit_vertex) {
exit_vertex = v;
continue;
}
if (exit_vertex == v) {
continue;
}
return false;
}
}
return true;
}
/**
* True if all edges out of vertices in the head shell lead to at most a single
* outside vertex, or the inverse for the tail shell.
*/
static
bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &head_shell,
const flat_set<NFAVertex> &tail_shell) {
if (shellHasOnePath(g, head_shell, adjacent_vertices_range<NGHolder>)) {
DEBUG_PRINTF("head shell has only one path through it\n");
return true;
}
if (shellHasOnePath(g, tail_shell, inv_adjacent_vertices_range<NGHolder>)) {
DEBUG_PRINTF("tail shell has only one path into it\n");
return true;
}
return false;
}
/**
* Common code called by calc- and recalc- below. Splits the given holder into
* one or more connected components, adding them to the comps deque.
@ -250,16 +296,25 @@ void splitIntoComponents(unique_ptr<NGHolder> g,
return;
}
// Find edges connecting the head and tail shells directly.
vector<NFAEdge> shell_edges = findShellEdges(*g, head_shell, tail_shell);
DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n",
head_shell.size(), tail_shell.size(), shell_edges.size());
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> old2new;
// If there are no shell edges and only one path out of the head shell or
// into the tail shell, we aren't going to find more than one component.
if (shell_edges.empty() && shellHasOnePath(*g, head_shell, tail_shell)) {
DEBUG_PRINTF("single component\n");
comps.push_back(std::move(g));
return;
}
unordered_map<NFAVertex, NFAUndirectedVertex> old2new;
auto ug = createUnGraph(*g, true, true, old2new);
// Construct reverse mapping.
ue2::unordered_map<NFAUndirectedVertex, NFAVertex> new2old;
unordered_map<NFAUndirectedVertex, NFAVertex> new2old;
for (const auto &m : old2new) {
new2old.emplace(m.second, m.first);
}
@ -301,7 +356,7 @@ void splitIntoComponents(unique_ptr<NGHolder> g,
DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c);
}
ue2::unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder
unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder
for (auto &vv : verts) {
// Shells are in every component.
vv.insert(vv.end(), begin(head_shell), end(head_shell));

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -62,9 +62,11 @@
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include "util/graph_small_color_map.h"
#include <algorithm>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
@ -123,17 +125,17 @@ class SearchVisitor : public boost::default_dfs_visitor {
} // namespace
template<class Graph>
template<class Graph, class ColorMap>
static
bool searchForward(const Graph &g, const CharReach &reach,
ColorMap &colours,
const flat_set<typename Graph::vertex_descriptor> &s,
typename Graph::vertex_descriptor w) {
map<NFAVertex, boost::default_color_type> colours;
colours.fill(small_color::white);
try {
depth_first_visit(g, w, SearchVisitor(reach),
make_assoc_property_map(colours),
VertexInSet<typename Graph::vertex_descriptor, Graph>(s));
} catch (SearchFailed&) {
depth_first_visit(g, w, SearchVisitor(reach), colours,
VertexInSet<typename Graph::vertex_descriptor, Graph>(s));
} catch (SearchFailed &) {
return false;
}
@ -162,6 +164,9 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v,
typedef typename Graph::vertex_descriptor vertex_descriptor;
// Colour map used for depth_first_visit().
auto colours = make_small_color_map(g);
// precalc successors of v.
flat_set<vertex_descriptor> succ_v;
insert(&succ_v, adjacent_vertices(v, g));
@ -200,7 +205,7 @@ bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v,
DEBUG_PRINTF(" - checking w %zu\n", g[w].index);
if (!searchForward(g, reach, s, w)) {
if (!searchForward(g, reach, colours, s, w)) {
continue;
}
@ -234,6 +239,8 @@ bool cyclicPathRedundancyPass(Graph &g, NGHolder &raw) {
}
bool removeCyclicPathRedundancy(NGHolder &g) {
assert(hasCorrectlyNumberedVertices(g));
// Forward pass.
bool f_changed = cyclicPathRedundancyPass(g, g);
if (f_changed) {

View File

@ -34,17 +34,18 @@
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
#include <deque>
#include <vector>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/dag_shortest_paths.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
#include <boost/graph/property_maps/constant_property_map.hpp>
#include <boost/graph/reverse_graph.hpp>
#include <boost/graph/topological_sort.hpp>
#include <boost/graph/property_maps/constant_property_map.hpp>
#include <boost/range/adaptor/reversed.hpp>
using namespace std;
@ -137,13 +138,15 @@ vector<bool> findLoopReachable(const Graph &g,
EdgeSet deadEdges;
BackEdges<EdgeSet> be(deadEdges);
depth_first_search(g, visitor(be).root_vertex(src));
auto colors = make_small_color_map(g);
depth_first_search(g, be, colors, src);
auto af = make_bad_edge_filter(&deadEdges);
auto acyclic_g = make_filtered_graph(g, af);
vector<Vertex> topoOrder; /* actually reverse topological order */
topoOrder.reserve(deadNodes.size());
topological_sort(acyclic_g, back_inserter(topoOrder));
topological_sort(acyclic_g, back_inserter(topoOrder), color_map(colors));
for (const auto &e : deadEdges) {
size_t srcIdx = g[source(e, g)].index;
@ -204,14 +207,16 @@ void calcDepthFromSource(const GraphT &g,
visitor(make_bfs_visitor(record_distances(
make_iterator_property_map(dMin.begin(),
min_index_map),
boost::on_tree_edge()))));
boost::on_tree_edge())))
.color_map(make_small_color_map(mindist_g)));
auto max_index_map = get(vertex_index, maxdist_g);
dag_shortest_paths(maxdist_g, srcVertex,
distance_map(make_iterator_property_map(dMax.begin(),
max_index_map))
.weight_map(make_constant_property<EdgeT>(-1)));
.weight_map(make_constant_property<EdgeT>(-1))
.color_map(make_small_color_map(maxdist_g)));
for (size_t i = 0; i < numVerts; i++) {
if (dMin[i] > DIST_UNREACHABLE) {

View File

@ -36,7 +36,6 @@
#include "ue2common.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "util/ue2_containers.h"
#include <boost-patched/graph/dominator_tree.hpp> // locally patched version
#include <boost-patched/graph/reverse_graph.hpp>

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -36,15 +36,14 @@
#define NG_DOMINATORS_H
#include "ng_holder.h"
#include "util/ue2_containers.h"
#include <unordered_map>
namespace ue2 {
class NGHolder;
std::unordered_map<NFAVertex, NFAVertex> findDominators(const NGHolder &g);
ue2::unordered_map<NFAVertex, NFAVertex> findDominators(const NGHolder &g);
ue2::unordered_map<NFAVertex, NFAVertex> findPostDominators(const NGHolder &g);
std::unordered_map<NFAVertex, NFAVertex> findPostDominators(const NGHolder &g);
} // namespace ue2

View File

@ -51,6 +51,7 @@
#include "smallwrite/smallwrite_dump.h"
#include "util/bitutils.h"
#include "util/dump_charclass.h"
#include "util/dump_util.h"
#include "util/report.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
@ -175,7 +176,7 @@ public:
: g(g_in), rm(&rm_in) {}
NFAWriter(const GraphT &g_in,
const ue2::unordered_map<NFAVertex, u32> &region_map_in)
const unordered_map<NFAVertex, u32> &region_map_in)
: g(g_in), region_map(&region_map_in) {}
void operator()(ostream& os, const VertexT& v) const {
@ -253,7 +254,7 @@ public:
private:
const GraphT &g;
const ReportManager *rm = nullptr;
const ue2::unordered_map<NFAVertex, u32> *region_map = nullptr;
const unordered_map<NFAVertex, u32> *region_map = nullptr;
};
}
@ -277,7 +278,7 @@ void dumpGraphImpl(const char *name, const GraphT &g, const ReportManager &rm) {
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
const unordered_map<NFAVertex, u32> &region_map) {
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
ofstream os(name);
@ -331,7 +332,7 @@ void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber,
}
void dumpHolderImpl(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map,
const unordered_map<NFAVertex, u32> &region_map,
unsigned int stageNumber, const char *stageName,
const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
@ -348,14 +349,7 @@ void dumpSmallWrite(const RoseEngine *rose, const Grey &grey) {
}
const struct SmallWriteEngine *smwr = getSmallWrite(rose);
stringstream ss;
ss << grey.dumpPath << "smallwrite.txt";
FILE *f = fopen(ss.str().c_str(), "w");
smwrDumpText(smwr, f);
fclose(f);
smwrDumpText(smwr, StdioFile(grey.dumpPath + "smallwrite.txt", "w"));
smwrDumpNFA(smwr, false, grey.dumpPath);
}
@ -420,9 +414,7 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) {
return;
}
stringstream ss;
ss << grey.dumpPath << "internal_reports.txt";
FILE *f = fopen(ss.str().c_str(), "w");
StdioFile f(grey.dumpPath + "internal_reports.txt", "w");
const vector<Report> &reports = rm.reports();
for (size_t i = 0; i < reports.size(); i++) {
const Report &report = reports[i];
@ -461,7 +453,6 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) {
}
fprintf(f, "\n");
}
fclose(f);
}
} // namespace ue2

View File

@ -36,7 +36,8 @@
#include "grey.h"
#include "ng_holder.h" // for graph types
#include "ue2common.h"
#include "util/ue2_containers.h"
#include <unordered_map>
#ifdef DUMP_SUPPORT
#include <fstream>
@ -75,7 +76,7 @@ void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber,
// Variant that takes a region map as well.
void dumpHolderImpl(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map,
const std::unordered_map<NFAVertex, u32> &region_map,
unsigned int stageNumber, const char *stageName,
const Grey &grey);
@ -123,7 +124,7 @@ void dumpHolder(UNUSED const NGHolder &h, UNUSED unsigned int stageNumber,
UNUSED static inline
void dumpHolder(UNUSED const NGHolder &h,
UNUSED const ue2::unordered_map<NFAVertex, u32> &region_map,
UNUSED const std::unordered_map<NFAVertex, u32> &region_map,
UNUSED unsigned int stageNumber, UNUSED const char *name,
UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -38,8 +38,8 @@
#include "parser/position.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <set>
#include <vector>
@ -181,6 +181,28 @@ bool removeEdgeRedundancyNearCyclesFwd(NGHolder &g, bool ignore_starts) {
return dead_count;
}
static
bool checkReportsRev(const NGHolder &g, NFAVertex v,
const set<NFAVertex> &happy) {
if (g[v].reports.empty()) {
return true;
}
assert(edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second);
/* an edge to accept takes priority over eod only accept */
NFAVertex accept = edge(v, g.accept, g).second ? g.accept : g.acceptEod;
flat_set<ReportID> happy_reports;
for (NFAVertex u : happy) {
if (edge(u, accept, g).second) {
insert(&happy_reports, g[u].reports);
}
}
return is_subset_of(g[v].reports, happy_reports);
}
/** \brief Redundant self-loop removal (reverse version).
*
* A self loop on a vertex v can be removed if:
@ -233,7 +255,8 @@ bool removeEdgeRedundancyNearCyclesRev(NGHolder &g) {
happy.insert(u);
}
if (!happy.empty() && checkVerticesRev(g, sad, happy)) {
if (!happy.empty() && checkVerticesRev(g, sad, happy)
&& checkReportsRev(g, v, happy)) {
dead_count++;
remove_edge(v, v, g);
}
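
checkReportsRev permits removing the self-loop on v only when every report raised by v is also raised by some happy predecessor on the same accept vertex; is_subset_of over sorted flat_sets is std::includes in spirit:

#include <algorithm>
#include <set>

// true iff every element of `small` also appears in `big` (both sorted)
bool is_subset(const std::set<unsigned> &small, const std::set<unsigned> &big) {
    return std::includes(big.begin(), big.end(), small.begin(), small.end());
}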

View File

@ -37,9 +37,10 @@
#include "ng_holder.h"
#include "ng_util.h"
#include "util/compile_context.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include "util/unordered.h"
#include <algorithm>
#include <memory>
@ -121,16 +122,9 @@ public:
vertex_flags == b.vertex_flags && rs == b.rs;
}
friend size_t hash_value(const ClassInfo &c) {
size_t val = 0;
boost::hash_combine(val, c.rs);
boost::hash_combine(val, c.vertex_flags);
boost::hash_combine(val, c.cr);
boost::hash_combine(val, c.adjacent_cr);
boost::hash_combine(val, c.node_type);
boost::hash_combine(val, c.depth.d1);
boost::hash_combine(val, c.depth.d2);
return val;
size_t hash() const {
return hash_all(rs, vertex_flags, cr, adjacent_cr, node_type, depth.d1,
depth.d2);
}
private:
@ -319,7 +313,7 @@ vector<VertexInfoSet> partitionGraph(vector<unique_ptr<VertexInfo>> &infos,
const size_t num_verts = infos.size();
vector<VertexInfoSet> classes;
unordered_map<ClassInfo, unsigned> classinfomap;
ue2_unordered_map<ClassInfo, unsigned> classinfomap;
// assume we will have lots of classes, so we don't waste time resizing
// these structures.
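
ClassInfo swaps its friend hash_value() chain of boost::hash_combine calls for a hash() member built on hash_all, and partitionGraph keys its map with ue2_unordered_map, presumably an unordered_map wired to that member hash. A variadic hash_all lookalike:

#include <boost/functional/hash.hpp>
#include <cstddef>

inline void hash_build(std::size_t &) {} // base case: nothing left to fold in

template <typename T, typename... Rest>
void hash_build(std::size_t &seed, const T &t, const Rest &... rest) {
    boost::hash_combine(seed, t);
    hash_build(seed, rest...);
}

template <typename... Args>
std::size_t hash_all(const Args &... args) {
    std::size_t seed = 0;
    hash_build(seed, args...);
    return seed;
}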

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -35,7 +35,7 @@
#define NG_EXECUTE_H
#include "ng_holder.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include <vector>

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -48,7 +48,7 @@ namespace ue2 {
static
bool findMask(const NGHolder &g, vector<CharReach> *mask, bool *anchored,
ue2::flat_set<ReportID> *reports) {
flat_set<ReportID> *reports) {
DEBUG_PRINTF("looking for a mask pattern\n");
set<NFAVertex> s_succ;
insert(&s_succ, adjacent_vertices(g.start, g));
@ -117,7 +117,7 @@ bool handleFixedWidth(RoseBuild &rose, const NGHolder &g, const Grey &grey) {
return false;
}
ue2::flat_set<ReportID> reports;
flat_set<ReportID> reports;
bool anchored = false;
vector<CharReach> mask;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,10 +40,12 @@
#include "util/bitfield.h"
#include "util/container.h"
#include "util/determinise.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/hash_dynamic_bitset.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include "util/unordered.h"
#include <algorithm>
#include <functional>
@ -236,7 +238,7 @@ public:
struct Big_Traits {
using StateSet = dynamic_bitset<>;
using StateMap = map<StateSet, dstate_id_t>;
using StateMap = unordered_map<StateSet, dstate_id_t, hash_dynamic_bitset>;
static StateSet init_states(u32 num) {
return StateSet(num);
@ -257,7 +259,7 @@ public:
struct Graph_Traits {
using StateSet = bitfield<NFA_STATE_LIMIT>;
using StateMap = ue2::unordered_map<StateSet, dstate_id_t>;
using StateMap = unordered_map<StateSet, dstate_id_t>;
static StateSet init_states(UNUSED u32 num) {
assert(num <= NFA_STATE_LIMIT);
@ -284,8 +286,8 @@ public:
class Automaton_Haig_Merge {
public:
typedef vector<u16> StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
using StateSet = vector<u16>;
using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
explicit Automaton_Haig_Merge(const vector<const raw_som_dfa *> &in)
: nfas(in.begin(), in.end()), dead(in.size()) {
@ -514,11 +516,11 @@ bool doHaig(const NGHolder &g, som_type som,
raw_som_dfa *rdfa) {
u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from
a fight */
typedef typename Auto::StateSet StateSet;
using StateSet = typename Auto::StateSet;
vector<StateSet> nfa_state_map;
Auto n(g, som, triggers, unordered_som);
try {
if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
if (!determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
DEBUG_PRINTF("state limit exceeded\n");
return false;
}
@ -720,15 +722,14 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
}
}
typedef Automaton_Haig_Merge::StateSet StateSet;
using StateSet = Automaton_Haig_Merge::StateSet;
vector<StateSet> nfa_state_map;
auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som,
NODE_START,
dfas[0]->stream_som_loc_width);
int rv = determinise(n, rdfa->states, limit, &nfa_state_map);
if (rv) {
DEBUG_PRINTF("%d:state limit (%u) exceeded\n", rv, limit);
if (!determinise(n, rdfa->states, limit, &nfa_state_map)) {
DEBUG_PRINTF("state limit (%u) exceeded\n", limit);
return nullptr; /* over state limit */
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -40,7 +40,7 @@
#include "ue2common.h"
#include "nfa/nfa_kind.h"
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include "util/ue2_graph.h"
namespace ue2 {
@ -67,7 +67,7 @@ struct NFAGraphEdgeProps {
/** \brief For graphs that will be implemented as multi-top engines, this
* specifies the top events. Only used on edges from the start vertex. */
ue2::flat_set<u32> tops;
flat_set<u32> tops;
/** \brief Flags associated with assertions. */
u32 assert_flags = 0;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -39,13 +39,9 @@
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include <set>
#include <boost/functional/hash/hash.hpp>
using namespace std;
@ -200,11 +196,11 @@ u64a hash_holder(const NGHolder &g) {
size_t rv = 0;
for (auto v : vertices_range(g)) {
boost::hash_combine(rv, g[v].index);
boost::hash_combine(rv, g[v].char_reach);
hash_combine(rv, g[v].index);
hash_combine(rv, g[v].char_reach);
for (auto w : adjacent_vertices_range(v, g)) {
boost::hash_combine(rv, g[w].index);
hash_combine(rv, g[w].index);
}
}

View File

@ -346,24 +346,4 @@ bytecode_ptr<NFA> constructLBR(const NGHolder &g,
return constructLBR(proto, triggers, cc, rm);
}
/** \brief True if graph \p g could be turned into an LBR engine. */
bool isLBR(const NGHolder &g, const Grey &grey) {
if (!grey.allowLbr) {
return false;
}
PureRepeat repeat;
if (!isPureRepeat(g, repeat)) {
DEBUG_PRINTF("not pure bounded repeat\n");
return false;
}
if (repeat.reports.size() != 1) {
DEBUG_PRINTF("too many reports\n");
return false;
}
return true;
}
} // namespace ue2

View File

@ -66,9 +66,6 @@ constructLBR(const CastleProto &proto,
const std::vector<std::vector<CharReach>> &triggers,
const CompileContext &cc, const ReportManager &rm);
/** \brief True if graph \p g could be turned into an LBR engine. */
bool isLBR(const NGHolder &g, const Grey &grey);
} // namespace ue2
#endif // NG_LBR_H

View File

@ -53,11 +53,13 @@
#include "util/container.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/flat_containers.h"
#include "util/verify_types.h"
#include <algorithm>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/range/adaptor/map.hpp>
@ -73,8 +75,8 @@ namespace ue2 {
// Only used in assertions.
static
bool sanityCheckGraph(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids) {
ue2::unordered_set<u32> seen_states;
const unordered_map<NFAVertex, u32> &state_ids) {
unordered_set<u32> seen_states;
for (auto v : vertices_range(g)) {
// Non-specials should have non-empty reachability.
@ -115,10 +117,9 @@ bool sanityCheckGraph(const NGHolder &g,
#endif
static
void findSquashStates(const NGHolder &g,
const vector<BoundedRepeatData> &repeats,
map<NFAVertex, NFAStateSet> &squashMap) {
squashMap = findSquashers(g);
unordered_map<NFAVertex, NFAStateSet> findSquashStates(const NGHolder &g,
const vector<BoundedRepeatData> &repeats) {
auto squashMap = findSquashers(g);
filterSquashers(g, squashMap);
/* We also filter out the cyclic states representing bounded repeats, as
@ -128,6 +129,8 @@ void findSquashStates(const NGHolder &g,
squashMap.erase(br.cyclic);
}
}
return squashMap;
}
/**
@ -468,7 +471,7 @@ void makeTopStates(NGHolder &g, map<u32, set<NFAVertex>> &tops_out,
static
set<NFAVertex> findZombies(const NGHolder &h,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
const unordered_map<NFAVertex, u32> &state_ids,
const CompileContext &cc) {
set<NFAVertex> zombies;
if (!cc.grey.allowZombies) {
@ -516,7 +519,7 @@ set<NFAVertex> findZombies(const NGHolder &h,
}
static
void reverseStateOrdering(ue2::unordered_map<NFAVertex, u32> &state_ids) {
void reverseStateOrdering(unordered_map<NFAVertex, u32> &state_ids) {
vector<NFAVertex> ordering;
for (auto &e : state_ids) {
if (e.second == NO_STATE) {
@ -569,7 +572,7 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers,
bool impl_test_only, const CompileContext &cc,
ue2::unordered_map<NFAVertex, u32> &state_ids,
unordered_map<NFAVertex, u32> &state_ids,
vector<BoundedRepeatData> &repeats,
map<u32, set<NFAVertex>> &tops) {
assert(is_triggered(h_in) || fixed_depth_tops.empty());
@ -637,7 +640,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm,
assert(rm);
}
ue2::unordered_map<NFAVertex, u32> state_ids;
unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, set<NFAVertex>> tops;
unique_ptr<NGHolder> h
@ -657,12 +660,12 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm,
br_cyclic[br.cyclic] = BoundedRepeatSummary(br.repeatMin, br.repeatMax);
}
map<NFAVertex, NFAStateSet> reportSquashMap;
map<NFAVertex, NFAStateSet> squashMap;
unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
unordered_map<NFAVertex, NFAStateSet> squashMap;
// build map of squashed and squashers
if (cc.grey.squashNFA) {
findSquashStates(*h, repeats, squashMap);
squashMap = findSquashStates(*h, repeats);
if (rm && cc.grey.highlanderSquash) {
reportSquashMap = findHighlanderSquashers(*h, *rm);
@ -734,8 +737,8 @@ bytecode_ptr<NFA> constructReversedNFA_i(const NGHolder &h_in, u32 hint,
map<u32, set<NFAVertex>> tops; /* only the standards tops for nfas */
set<NFAVertex> zombies;
vector<BoundedRepeatData> repeats;
map<NFAVertex, NFAStateSet> reportSquashMap;
map<NFAVertex, NFAStateSet> squashMap;
unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
unordered_map<NFAVertex, NFAStateSet> squashMap;
return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops,
zombies, false, false, hint, cc);
@ -785,7 +788,7 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
* resultant NGHolder has <= NFA_MAX_STATES. If it does, we know we can
* implement it as an NFA. */
ue2::unordered_map<NFAVertex, u32> state_ids;
unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, set<NFAVertex>> tops;
unique_ptr<NGHolder> h
@ -832,7 +835,7 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm,
const map<u32, u32> fixed_depth_tops; // empty
const map<u32, vector<vector<CharReach>>> triggers; // empty
ue2::unordered_map<NFAVertex, u32> state_ids;
unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, set<NFAVertex>> tops;
unique_ptr<NGHolder> h
@ -848,8 +851,8 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm,
// Should have no bearing on accel calculation, so we leave these empty.
const set<NFAVertex> zombies;
const map<NFAVertex, NFAStateSet> reportSquashMap;
const map<NFAVertex, NFAStateSet> squashMap;
unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
unordered_map<NFAVertex, NFAStateSet> squashMap;
return countAccelStates(*h, state_ids, repeats, reportSquashMap, squashMap,
tops, zombies, cc);

Some files were not shown because too many files have changed in this diff.