Merge branch develop into master

Matthew Barr 2015-12-18 14:41:50 +11:00
commit 0e5c4cbd1d
72 changed files with 1021 additions and 872 deletions

CHANGELOG.md — new file (36 lines added)
View File

@ -0,0 +1,36 @@
# Hyperscan Change Log
This is a list of notable changes to Hyperscan, in reverse chronological order.
## [4.1.0] 2015-12-18
- Update version of PCRE used by testing tools as a syntax and semantic
reference to PCRE 8.38.
- Small updates to fix warnings identified by Coverity.
- Clean up and unify exception handling behaviour across GPR and SIMD NFA
models.
- Fix bug in handling of bounded repeat triggers with large gaps between them
for the sparse repeat model.
- Correctly reject POSIX collating elements (`[.ch.]`, `[=ch=]`) in the parser.
These are not supported by Hyperscan.
- Add support for quoted sequences (`\Q...\E`) inside character classes (see
the sketch after this list).
- Simplify FDR literal matcher runtime by removing some static specialization.
- Fix handling of the POSIX `[:graph:]`, `[:print:]` and `[:punct:]` character
classes to match the behaviour of PCRE 8.38 in both standard operation and
with the UCP flag set. (Note: some bugs were fixed in this area in PCRE
8.38.) Previously Hyperscan's behaviour was the same as versions of PCRE
before 8.34.
- Improve performance when compiling pattern sets that include a large number
of similar bounded repeat constructs. (github issue #9)
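As an illustration of the quoted-sequence support noted above, a minimal sketch using the public `hs_compile()` API in block mode; the pattern, flags and error handling are illustrative assumptions, not taken from this release.

```c
#include <stdio.h>
#include <hs.h>

int main(void) {
    /* Inside a character class, \Q...\E quotes its contents literally:
     * this pattern matches 'a', '-' or 'z', not the range a-z. */
    const char *pattern = "[\\Qa-z\\E]";

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    if (hs_compile(pattern, 0, HS_MODE_BLOCK, NULL, &db, &err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", err->message);
        hs_free_compile_error(err);
        return 1;
    }
    hs_free_database(db);
    return 0;
}
```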
## [4.0.1] 2015-10-30
- Minor cleanups to test code.
- CMake and other build system improvements.
- API update: allow `hs_reset_stream()` and `hs_reset_and_copy_stream()` to be
supplied with a NULL scratch pointer if no matches are required. This is in
line with the behaviour of `hs_close_stream()` (see the sketch after this list).
- Disallow bounded repeats with a very large minimum repeat but no maximum,
i.e. {N,} for very large N.
- Reduce compile memory usage in literal set expansion for some large cases.
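A minimal sketch of the NULL-scratch usage described above; the helper name `recycle_stream` is hypothetical.

```c
#include <stddef.h>
#include <hs.h>

/* Hypothetical helper: reuse an existing stream for fresh input when any
 * matches pending at the end of the old stream can be discarded. With this
 * change, both the scratch pointer and the match callback may be NULL. */
static hs_error_t recycle_stream(hs_stream_t *stream) {
    return hs_reset_stream(stream, 0 /* flags */, NULL /* scratch */,
                           NULL /* onEvent */, NULL /* context */);
}
```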
## [4.0.0] 2015-10-20
- Original release of Hyperscan as open-source software.

View File

@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 2.8.11)
project (Hyperscan C CXX)
set (HS_MAJOR_VERSION 4)
set (HS_MINOR_VERSION 0)
set (HS_PATCH_VERSION 1)
set (HS_MINOR_VERSION 1)
set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
INCLUDE (CheckFunctionExists)
@ -56,8 +56,9 @@ if(CMAKE_GENERATOR STREQUAL Xcode)
set(XCODE TRUE)
endif()
include_directories(src .)
include_directories(${CMAKE_BINARY_DIR})
set(CMAKE_INCLUDE_CURRENT_DIR 1)
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PROJECT_BINARY_DIR})
include_directories(SYSTEM include)
set(BOOST_USE_STATIC_LIBS OFF)
@ -71,7 +72,7 @@ find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND)
# we might have boost in tree, so provide a hint and try again
message(STATUS "trying include dir for boost")
set(BOOST_INCLUDEDIR "${CMAKE_SOURCE_DIR}/include")
set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND)
message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
@ -219,6 +220,15 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
if (RELEASE_BUILD)
if (HAS_C_HIDDEN)
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
endif()
if (HAS_CXX_HIDDEN)
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden")
endif()
endif()
# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
@ -327,8 +337,8 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/tools)
endif()
# do substitutions
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${CMAKE_BINARY_DIR}/config.h)
configure_file(src/hs_version.h.in hs_version.h)
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
if (PKG_CONFIG_FOUND)
# we really only need to do this if we have pkg-config
@ -345,7 +355,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# include the autogen targets
add_subdirectory(src/fdr)
include_directories(${CMAKE_BINARY_DIR}/src/fdr)
include_directories(${PROJECT_BINARY_DIR}/src/fdr)
if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused")

View File

@ -20,3 +20,24 @@ the [Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/)
Hyperscan is licensed under the BSD License. See the LICENSE file in the
project repository.
# Versioning
The `master` branch on Github will always contain the most recent release of
Hyperscan. Each version released to `master` goes through QA and testing before
it is released; if you're a user, rather than a developer, this is the version
you should be using.
Further development towards the next release takes place on the `develop`
branch.
# Get Involved
The official homepage for Hyperscan is at [01.org/hyperscan](https://01.org/hyperscan).
If you have questions or comments, we encourage you to [join the mailing
list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by
sending email to the list, or by creating an issue on Github.
If you wish to contact the Hyperscan team at Intel directly, without posting
publicly to the mailing list, send email to
[hyperscan@intel.com](mailto:hyperscan@intel.com).

View File

@ -63,6 +63,9 @@ described at <http://www.pcre.org/>. However, not all constructs available in
libpcre are supported. The use of unsupported constructs will result in
compilation errors.
The version of PCRE used to validate Hyperscan's interpretation of this syntax
is 8.38.
====================
Supported Constructs
====================

View File

@ -109,7 +109,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
* limit the size of our buffer appropriately. */
if ((unsigned long)dataLen > UINT_MAX) {
dataLen = UINT_MAX;
printf("WARNING: clipping data to %lu bytes\n", dataLen);
printf("WARNING: clipping data to %ld bytes\n", dataLen);
} else if (dataLen == 0) {
fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
fclose(f);
@ -118,7 +118,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
char *inputData = malloc(dataLen);
if (!inputData) {
fprintf(stderr, "ERROR: unable to malloc %lu bytes\n", dataLen);
fprintf(stderr, "ERROR: unable to malloc %ld bytes\n", dataLen);
fclose(f);
return NULL;
}

View File

@ -27,11 +27,11 @@ fdr_autogen(teddy_runtime teddy_autogen.c)
fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
set(fdr_GENERATED_SRC
${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen.c
${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen.c
${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
PARENT_SCOPE)
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
PARENT_SCOPE)
set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
include_directories(${CMAKE_CURRENT_BINARY_DIR})

View File

@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
def build_fdr_matchers():
all_matchers = [ ]
domains = [8, 10, 11, 12, 13]
big_domains = [ 14, 15 ]
strides = [ 1, 2, 4 ]
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
for d in domains:
all_matchers += [ M3(stride = 1, domain = d, **common) ]
all_matchers += [ M3(stride = 2, domain = d, **common) ]
all_matchers += [ M3(stride = 4, domain = d, **common) ]
for d in big_domains:
all_matchers += [ M3(stride = 1, domain = d, **common) ]
for s in strides:
all_matchers += [ M3(stride = s, **common) ]
return all_matchers

View File

@ -40,27 +40,6 @@
#include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h"
#include "fdr_loadval.h"
static really_inline UNUSED
u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
u32 r = 0;
if (a->start_offset == 0) {
if (numBits <= 8) {
r = a->buf_history[a->len_history - 1];
} else {
r = a->buf_history[a->len_history - 1];
r |= (a->buf[0] << 8);
}
} else {
if (numBits <= 8) {
r = a->buf[a->start_offset - 1];
} else {
r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
}
}
return r & ((1 << numBits) - 1);
}
#include "fdr_autogen.c"
#define FAKE_HISTORY_SIZE 16

View File

@ -74,12 +74,12 @@ class ValueExtractStep(Step):
dsb = m.datasize_bytes
modval = offset % dsb
if m.domain > 8 and modval == dsb - 1:
if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
code_string = "cautious_forward"
code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
@ -101,7 +101,7 @@ class ValueExtractStep(Step):
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
@ -173,14 +173,10 @@ class ConfirmStep(Step):
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
def get_hash_safety_parameters(self):
h_size = self.single_load_type.size_in_bytes()
return (0, h_size - 1)
def produce_compile_call(self):
print " { %d, %d, %d, %d, %d, %s, %d, %d }," % (
print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
self.stride, self.domain,
self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
@ -192,8 +188,8 @@ class M3(MatcherBase):
ctxt = CodeGenContext(self)
if switch_variant:
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
@ -349,25 +345,30 @@ class M3(MatcherBase):
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
$TYPENAME s;
if (a->len_history) {
u32 tmp = getPreStartVal(a, $DOMAIN);
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
$TYPENAME s;
if (a->len_history) {
u32 tmp = 0;
if (a->start_offset == 0) {
tmp = a->buf_history[a->len_history - 1];
tmp |= (a->buf[0] << 8);
} else {
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
}
tmp &= fdr->domainMask;
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
DOMAIN = self.domain,
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
(behind, ahead) = self.get_hash_safety_parameters()
loop_read_behind = behind
loop_read_ahead = self.loop_bytes + ahead
loop_read_behind = 0
loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
@ -380,7 +381,7 @@ class M3(MatcherBase):
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
if ssb == 16 and self.domain == 16:
if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
@ -390,7 +391,6 @@ class M3(MatcherBase):
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
print self.produce_header(visible = False)
@ -398,21 +398,19 @@ class M3(MatcherBase):
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
print " Domain: %d" % self.domain,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
print
print "\tconst size_t tabSize = %d;" % self.table_size
print """
const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
const u32 * confBase = (const u32 *)(ft + tabSize);
"""
print " assert(fdr->domain > 8 && fdr->domain < 16);"
print
print " u64a domain_mask = fdr->domainMask;"
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
print "\tconst size_t iterBytes = %d;" % self.loop_bytes
print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print " const size_t iterBytes = %d;" % self.loop_bytes
print " const size_t START_MOD = %d;" % self.datasize_bytes
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
@ -451,9 +449,9 @@ class M3(MatcherBase):
print self.produce_footer()
def get_name(self):
return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
def __init__(self, state_width, domain, stride,
def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
@ -474,17 +472,9 @@ class M3(MatcherBase):
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
# domain is the number of bits that we draw from our input to
# index our 'reach' table
if not 8 <= domain <= 16:
fail_out("Unsupported domain: %d" % domain)
self.domain = domain
# this is the load type required for this domain if we want to
# this is the load type required for domain [9:15] if we want to
# load it one at a time
self.single_load_type = getRequiredType(self.domain)
# table size
self.table_size = 2**domain * table_state_width // 8
self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table

View File

@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
ptr += floodControlTmp.second;
aligned_free(floodControlTmp.first);
/* we are allowing domains 9 to 15 only */
assert(eng.bits > 8 && eng.bits < 16);
fdr->domain = eng.bits;
fdr->schemeWidthByte = eng.schemeWidth / 8;
fdr->domainMask = (1 << eng.bits) - 1;
fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
if (link.first) {
fdr->link = verify_u32(ptr - fdr_base);
memcpy(ptr, link.first, link.second);
@ -245,6 +252,8 @@ void FDRCompiler::assignStringsToBuckets() {
typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
u32 ls = verify_u32(lits.size());
assert(ls); // Shouldn't be called with no literals.
// make a vector that contains our literals as pointers or u32 LiteralIndex values
vector<LiteralIndex> vli;
vli.resize(ls);
@ -292,6 +301,8 @@ void FDRCompiler::assignStringsToBuckets() {
currentChunk++;
}
}
assert(currentChunk > 0);
count[currentChunk - 1] = ls - chunkStartID;
// close off chunks with an empty row
firstIds[currentChunk] = ls;
@ -383,12 +394,14 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
const vector<hwlmLiteral> &lits,
SuffixPositionInString pos,
std::map<u32, ue2::unordered_set<u32> > &m2) {
assert(eng.bits < 32);
u32 distance = 0;
if (eng.bits <= 8) {
distance = 1;
} else if (eng.bits <= 16) {
distance = 2;
} else if (eng.bits <= 32) {
} else {
distance = 4;
}
@ -528,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
return nullptr;
}
// temporary hack for unit testing
if (hint != HINT_INVALID) {
des->bits = 9;
}
FDRCompiler fc(lits, *des, make_small);
return fc.build(link);
}

View File

@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
unique_ptr<FDREngineDescription> des =
getFdrDescription(fdr->engineID);
if (des) {
fprintf(f, " domain %u\n", des->bits);
fprintf(f, " stride %u\n", des->stride);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " width %u\n", des->schemeWidth);

View File

@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@ -105,76 +105,83 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
desiredStride);
const FDREngineDescription *best = nullptr;
FDREngineDescription *best = nullptr;
u32 best_score = 0;
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
const FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) {
continue;
}
if (msl < eng.stride) {
continue;
}
u32 score = 100;
score -= absdiff(desiredStride, eng.stride);
if (eng.stride <= desiredStride) {
score += eng.stride;
}
u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal;
if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) {
ideal = 8;
} else {
ideal = 10;
for (u32 domain = 9; domain <= 15; domain++) {
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
// make sure that domains >= 14 only use stride 1, as in the original matcher set
if (domain > 13 && engineID > 0) {
continue;
}
FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) {
continue;
}
if (msl < eng.stride) {
continue;
}
} else if (effLits < 20) {
ideal = 10;
} else if (effLits < 100) {
ideal = 11;
} else if (effLits < 1000) {
ideal = 12;
} else if (effLits < 10000) {
ideal = 13;
} else {
ideal = 15;
}
if (ideal != 8 && eng.schemeWidth == 32) {
ideal += 1;
}
u32 score = 100;
if (make_small) {
ideal -= 2;
}
score -= absdiff(desiredStride, eng.stride);
if (eng.stride > 1) {
ideal++;
}
if (eng.stride <= desiredStride) {
score += eng.stride;
}
DEBUG_PRINTF("effLits %u\n", effLits);
u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal;
if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) {
ideal = 8;
} else {
ideal = 10;
}
} else if (effLits < 20) {
ideal = 10;
} else if (effLits < 100) {
ideal = 11;
} else if (effLits < 1000) {
ideal = 12;
} else if (effLits < 10000) {
ideal = 13;
} else {
ideal = 15;
}
if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables
* on lightweight machines due to their small caches. */
ideal -= 2;
}
if (ideal != 8 && eng.schemeWidth == 32) {
ideal += 1;
}
score -= absdiff(ideal, eng.bits);
if (make_small) {
ideal -= 2;
}
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
"-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits,
eng.getNumBuckets(), eng.stride, score);
if (eng.stride > 1) {
ideal++;
}
if (!best || score > best_score) {
best = &eng;
best_score = score;
DEBUG_PRINTF("effLits %u\n", effLits);
if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables
* on lightweight machines due to their small caches. */
ideal -= 2;
}
score -= absdiff(ideal, domain);
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
"-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits,
eng.getNumBuckets(), eng.stride, score);
if (!best || score > best_score) {
eng.bits = domain;
best = &eng;
best_score = score;
}
}
}

View File

@ -43,7 +43,6 @@ struct FDREngineDef {
u32 schemeWidth;
u32 numBuckets;
u32 stride;
u32 bits;
u64a cpu_features;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;

View File

@ -76,9 +76,11 @@ struct FDR {
* structures (spillover strings and hash table) if we're a secondary
* structure. */
u32 link;
u8 domain; /* dynamic domain info */
u8 schemeWidthByte; /* scheme width in bytes */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
u32 pad2;
u32 pad3;
union {
u32 s_u32;

View File

@ -58,11 +58,13 @@
#include <boost/range/adaptor/map.hpp>
using namespace std;
using boost::adaptors::map_keys;
using boost::adaptors::map_values;
namespace ue2 {
#define CASTLE_MAX_TOPS 32
#define CLIQUE_GRAPH_MAX_SIZE 1000
static
u32 depth_to_u32(const depth &d) {
@ -106,51 +108,35 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
}
static
size_t literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b) {
bool literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b,
const size_t dist) {
for (size_t i = 0; i < b.size(); i++) {
if (i > dist) {
return true;
}
size_t overlap_len = b.size() - i;
if (overlap_len <= a.size()) {
if (matches(a.end() - overlap_len, a.end(), b.begin(),
b.end() - i)) {
return i;
return false;
}
} else {
assert(overlap_len > a.size());
if (matches(a.begin(), a.end(), b.end() - i - a.size(),
b.end() - i)) {
return i;
return false;
}
}
}
return b.size();
return b.size() > dist;
}
// UE-2666 case 1: The problem of finding the largest exclusive subcastles group
// can be reformulated as finding the largest clique (subgraph where every
// vertex is connected to every other vertex) in the graph. We use an
// approximate algorithm here to find the maximum clique.
// References
// ----------
// [1] Boppana, R., & Halldórsson, M. M. (1992).
// Approximating maximum independent sets by excluding subgraphs.
// BIT Numerical Mathematics, 32(2), 180–196. Springer.
// doi:10.1007/BF01994876
// ----------
struct CliqueVertexProps {
CliqueVertexProps() {}
explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
u32 stateId = ~0U;
u32 parentId = ~0U;
bool leftChild = false; /* tells us if it is the left child of its parent */
bool rightChildVisited = false; /* tells us if its right child is visited */
vector<u32> clique1; /* clique for the left branch */
vector<u32> indepSet1; /* independent set for the left branch */
vector<u32> clique2; /* clique for the right branch */
vector<u32> indepSet2; /* independent set for the right branch */
};
typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
@ -158,181 +144,54 @@ typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
typedef CliqueGraph::vertex_descriptor CliqueVertex;
static
unique_ptr<CliqueGraph> makeCG(const vector<vector<u32>> &exclusiveSet) {
u32 size = exclusiveSet.size();
vector<CliqueVertex> vertices;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
for (u32 i = 0; i < size; ++i) {
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
vertices.push_back(v);
}
// construct the complement graph, then its maximum independent sets
// are equal to the maximum clique of the original graph
for (u32 i = 0; i < size; ++i) {
CliqueVertex s = vertices[i];
vector<u32> complement(size, 0);
for (u32 j = 0; j < exclusiveSet[i].size(); ++j) {
u32 val = exclusiveSet[i][j];
complement[val] = 1;
}
for (u32 k = i + 1; k < size; ++k) {
if (!complement[k]) {
CliqueVertex d = vertices[k];
add_edge(s, d, *cg);
}
}
}
return cg;
}
static
CliqueGraph createSubgraph(const CliqueGraph &cg,
const vector<CliqueVertex> &vertices) {
CliqueGraph g;
map<u32, CliqueVertex> vertexMap;
for (auto u : vertices) {
u32 id = cg[u].stateId;
CliqueVertex v = add_vertex(CliqueVertexProps(id), g);
vertexMap[id] = v;
}
set<u32> found;
for (auto u : vertices) {
u32 srcId = cg[u].stateId;
CliqueVertex src = vertexMap[srcId];
found.insert(srcId);
for (auto n : adjacent_vertices_range(u, cg)) {
u32 dstId = cg[n].stateId;
if (found.find(dstId) == found.end() &&
vertexMap.find(dstId) != vertexMap.end()) {
CliqueVertex dst = vertexMap[dstId];
add_edge(src, dst, g);
}
}
}
return g;
}
static
void getNeighborInfo(const CliqueGraph &g, vector<CliqueVertex> &neighbor,
vector<CliqueVertex> &nonneighbor,
const CliqueVertex &cv) {
void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
const CliqueVertex &cv, const set<u32> &group) {
u32 id = g[cv].stateId;
ue2::unordered_set<u32> neighborId;
// find neighbors for cv
for (auto v : adjacent_vertices_range(cv, g)) {
neighbor.push_back(v);
neighborId.insert(g[v].stateId);
}
// find non-neighbors for cv
for (auto v : vertices_range(g)) {
if (g[v].stateId != id &&
neighborId.find(g[v].stateId) == neighborId.end()) {
nonneighbor.push_back(v);
for (const auto &v : adjacent_vertices_range(cv, g)) {
if (g[v].stateId != id && contains(group, g[v].stateId)){
neighbor.push_back(g[v].stateId);
neighborId.insert(g[v].stateId);
DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
}
}
}
static
void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n,
vector<u32> &clique, vector<u32> &indepSet) {
u32 id = cg[n].stateId;
if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) {
cg[n].clique1.push_back(id);
clique.swap(cg[n].clique1);
} else {
clique.swap(cg[n].clique2);
}
void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique) {
stack<vector<u32>> gStack;
if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) {
cg[n].indepSet2.push_back(id);
indepSet.swap(cg[n].indepSet2);
} else {
indepSet.swap(cg[n].indepSet1);
}
}
static
void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique,
vector<u32> &indepSet) {
stack<CliqueGraph> gStack;
gStack.push(cg);
// create mapping between vertex and id
// Create mapping between vertex and id
map<u32, CliqueVertex> vertexMap;
for (auto v : vertices_range(cg)) {
vector<u32> init;
for (const auto &v : vertices_range(cg)) {
vertexMap[cg[v].stateId] = v;
init.push_back(cg[v].stateId);
}
gStack.push(init);
// get the vertex to start from
ue2::unordered_set<u32> foundVertexId;
// Get the vertex to start from
CliqueGraph::vertex_iterator vi, ve;
tie(vi, ve) = vertices(cg);
CliqueVertex start = *vi;
u32 startId = cg[start].stateId;
bool leftChild = false;
u32 prevId = startId;
while (!gStack.empty()) {
CliqueGraph g = gStack.top();
vector<u32> g = gStack.top();
gStack.pop();
// choose a vertex from the graph
tie(vi, ve) = vertices(g);
CliqueVertex cv = *vi;
u32 id = g[cv].stateId;
// corresponding vertex in the original graph
CliqueVertex n = vertexMap.at(id);
vector<CliqueVertex> neighbor;
vector<CliqueVertex> nonneighbor;
getNeighborInfo(g, neighbor, nonneighbor, cv);
if (foundVertexId.find(id) != foundVertexId.end()) {
prevId = id;
// get graph consisting of non-neighbors for right branch
if (!cg[n].rightChildVisited) {
gStack.push(g);
if (!nonneighbor.empty()) {
const CliqueGraph &nSub = createSubgraph(g, nonneighbor);
gStack.push(nSub);
leftChild = false;
}
cg[n].rightChildVisited = true;
} else if (id != startId) {
// both the left and right branches are visited,
// update its parent's clique and independent sets
u32 parentId = cg[n].parentId;
CliqueVertex parent = vertexMap.at(parentId);
if (cg[n].leftChild) {
updateCliqueInfo(cg, n, cg[parent].clique1,
cg[parent].indepSet1);
} else {
updateCliqueInfo(cg, n, cg[parent].clique2,
cg[parent].indepSet2);
}
}
} else {
foundVertexId.insert(id);
g[n].leftChild = leftChild;
g[n].parentId = prevId;
gStack.push(g);
// get graph consisting of neighbors for left branch
if (!neighbor.empty()) {
const CliqueGraph &sub = createSubgraph(g, neighbor);
gStack.push(sub);
leftChild = true;
}
prevId = id;
// Choose a vertex from the graph
u32 id = g[0];
const CliqueVertex &n = vertexMap.at(id);
clique.push_back(id);
// Corresponding vertex in the original graph
vector<u32> neighbor;
set<u32> subgraphId(g.begin(), g.end());
getNeighborInfo(cg, neighbor, n, subgraphId);
// Get graph consisting of neighbors for left branch
if (!neighbor.empty()) {
gStack.push(neighbor);
}
}
updateCliqueInfo(cg, start, clique, indepSet);
}
template<typename Graph>
@ -345,18 +204,17 @@ bool graph_empty(const Graph &g) {
static
vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1);
vector<vector<u32>> indepSetsVec(1);
DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg));
findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]);
findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back();
vector<CliqueVertex> dead;
for (auto v : vertices_range(cg)) {
for (const auto &v : vertices_range(cg)) {
if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
dead.push_back(v);
}
}
for (auto v : dead) {
for (const auto &v : dead) {
clear_vertex(v, cg);
remove_vertex(v, cg);
}
@ -364,30 +222,22 @@ vector<u32> removeClique(CliqueGraph &cg) {
break;
}
vector<u32> clique;
vector<u32> indepSet;
findCliqueGroup(cg, clique, indepSet);
findCliqueGroup(cg, clique);
cliquesVec.push_back(clique);
indepSetsVec.push_back(indepSet);
}
// get the independent set with max size
size_t max = 0;
size_t id = 0;
for (size_t j = 0; j < indepSetsVec.size(); ++j) {
if (indepSetsVec[j].size() > max) {
max = indepSetsVec[j].size();
for (size_t j = 0; j < cliquesVec.size(); ++j) {
if (cliquesVec[j].size() > max) {
max = cliquesVec[j].size();
id = j;
}
}
DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size());
return indepSetsVec[id];
}
static
vector<u32> findMaxClique(const vector<vector<u32>> &exclusiveSet) {
auto cg = makeCG(exclusiveSet);
return removeClique(*cg);
DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size());
return cliquesVec[id];
}
// if the location of any reset character in one literal are after
@ -401,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2,
const auto &triggers2 = triggers[id2];
for (u32 i = 0; i < triggers1.size(); ++i) {
for (u32 j = 0; j < triggers2.size(); ++j) {
size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]);
size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]);
if (max_overlap1 <= min_reset_dist[id2][j] ||
max_overlap2 <= min_reset_dist[id1][i]) {
if (!literalOverlap(triggers1[i], triggers2[j],
min_reset_dist[id2][j]) ||
!literalOverlap(triggers2[j], triggers1[i],
min_reset_dist[id1][i])) {
return false;
}
}
@ -420,28 +270,33 @@ vector<u32> checkExclusion(const CharReach &cr,
return group;
}
vector<vector<size_t> > min_reset_dist;
vector<vector<size_t>> min_reset_dist;
// get min reset distance for each repeat
for (auto it = triggers.begin(); it != triggers.end(); it++) {
const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr);
min_reset_dist.push_back(tmp_dist);
}
vector<vector<u32>> exclusiveSet;
vector<CliqueVertex> vertices;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
for (u32 i = 0; i < triggers.size(); ++i) {
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
vertices.push_back(v);
}
// find exclusive pair for each repeat
for (u32 i = 0; i < triggers.size(); ++i) {
vector<u32> repeatIds;
CliqueVertex s = vertices[i];
for (u32 j = i + 1; j < triggers.size(); ++j) {
if (findExclusivePair(i, j, min_reset_dist, triggers)) {
repeatIds.push_back(j);
CliqueVertex d = vertices[j];
add_edge(s, d, *cg);
}
}
exclusiveSet.push_back(repeatIds);
DEBUG_PRINTF("Exclusive pair size:%lu\n", repeatIds.size());
}
// find the largest exclusive group
return findMaxClique(exclusiveSet);
return removeClique(*cg);
}
static
@ -599,7 +454,7 @@ buildCastle(const CastleProto &proto,
repeatInfoPair.push_back(make_pair(min_period, is_reset));
if (is_reset) {
if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) {
candidateTriggers.push_back(triggers.at(top));
candidateRepeats.push_back(i);
}
@ -608,7 +463,7 @@ buildCastle(const CastleProto &proto,
// Case 1: exclusive repeats
bool exclusive = false;
bool pureExclusive = false;
u8 activeIdxSize = 0;
u32 activeIdxSize = 0;
set<u32> exclusiveGroup;
if (cc.grey.castleExclusive) {
vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers);
@ -617,7 +472,7 @@ buildCastle(const CastleProto &proto,
// Case 1: mutual exclusive repeats group found, initialize state
// sizes
exclusive = true;
activeIdxSize = calcPackedBytes(exclusiveSize);
activeIdxSize = calcPackedBytes(numRepeats + 1);
if (exclusiveSize == numRepeats) {
pureExclusive = true;
streamStateSize = 0;
@ -665,7 +520,7 @@ buildCastle(const CastleProto &proto,
c->numRepeats = verify_u32(subs.size());
c->exclusive = exclusive;
c->pureExclusive = pureExclusive;
c->activeIdxSize = activeIdxSize;
c->activeIdxSize = verify_u8(activeIdxSize);
writeCastleScanEngine(cr, c);
@ -710,8 +565,8 @@ buildCastle(const CastleProto &proto,
set<ReportID> all_reports(const CastleProto &proto) {
set<ReportID> reports;
for (const PureRepeat &pr : proto.repeats | map_values) {
reports.insert(pr.reports.begin(), pr.reports.end());
for (const ReportID &report : proto.report_map | map_keys) {
reports.insert(report);
}
return reports;
}
@ -732,10 +587,30 @@ depth findMaxWidth(const CastleProto &proto) {
return max_width;
}
depth findMinWidth(const CastleProto &proto, u32 top) {
if (!contains(proto.repeats, top)) {
assert(0); // should not happen
return depth::infinity();
}
return proto.repeats.at(top).bounds.min;
}
depth findMaxWidth(const CastleProto &proto, u32 top) {
if (!contains(proto.repeats, top)) {
assert(0); // should not happen
return depth(0);
}
return proto.repeats.at(top).bounds.max;
}
CastleProto::CastleProto(const PureRepeat &pr) {
assert(pr.reach.any());
assert(pr.reports.size() == 1);
repeats.insert(make_pair(0, pr));
u32 top = 0;
repeats.emplace(top, pr);
for (const auto &report : pr.reports) {
report_map[report].insert(top);
}
}
const CharReach &CastleProto::reach() const {
@ -743,25 +618,29 @@ const CharReach &CastleProto::reach() const {
return repeats.begin()->second.reach;
}
static
u32 find_next_top(const map<u32, PureRepeat> &repeats) {
u32 top = 0;
for (; contains(repeats, top); top++) {
// pass
}
return top;
}
u32 CastleProto::add(const PureRepeat &pr) {
assert(repeats.size() < max_occupancy);
assert(pr.reach == reach());
assert(pr.reports.size() == 1);
u32 top = find_next_top(repeats);
u32 top = next_top++;
DEBUG_PRINTF("selected unused top %u\n", top);
repeats.insert(make_pair(top, pr));
assert(!contains(repeats, top));
repeats.emplace(top, pr);
for (const auto &report : pr.reports) {
report_map[report].insert(top);
}
return top;
}
void CastleProto::erase(u32 top) {
DEBUG_PRINTF("erase top %u\n", top);
assert(contains(repeats, top));
repeats.erase(top);
for (auto &m : report_map) {
m.second.erase(top);
}
}
u32 CastleProto::merge(const PureRepeat &pr) {
assert(repeats.size() <= max_occupancy);
assert(pr.reach == reach());
@ -806,8 +685,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2,
const u32 top = m.first;
const PureRepeat &pr = m.second;
DEBUG_PRINTF("top %u\n", top);
u32 new_top = find_next_top(c1.repeats);
c1.repeats.insert(make_pair(new_top, pr));
u32 new_top = c1.add(pr);
top_map[top] = new_top;
DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top);
}
@ -823,12 +701,23 @@ void remapCastleTops(CastleProto &proto, map<u32, u32> &top_map) {
for (const auto &m : proto.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
u32 new_top = find_next_top(out);
out.insert(make_pair(new_top, pr));
u32 new_top = out.size();
out.emplace(new_top, pr);
top_map[top] = new_top;
}
proto.repeats.swap(out);
// Remap report map.
proto.report_map.clear();
for (const auto &m : proto.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
for (const auto &report : pr.reports) {
proto.report_map[report].insert(top);
}
}
assert(proto.repeats.size() <= proto.max_occupancy);
}
@ -904,18 +793,17 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) {
return c1.repeats == c2.repeats;
}
bool requiresDedupe(const CastleProto &proto, const set<ReportID> &reports) {
ue2::unordered_set<ReportID> seen;
for (const PureRepeat &pr : proto.repeats | map_values) {
for (const ReportID &report : pr.reports) {
if (contains(reports, report)) {
if (contains(seen, report)) {
DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
report);
return true;
}
seen.insert(report);
}
bool requiresDedupe(const CastleProto &proto,
const ue2::flat_set<ReportID> &reports) {
for (const auto &report : reports) {
auto it = proto.report_map.find(report);
if (it == end(proto.report_map)) {
continue;
}
if (it->second.size() > 1) {
DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
report);
return true;
}
}
return false;

View File

@ -38,6 +38,7 @@
#include "nfagraph/ng_repeat.h"
#include "util/alloc.h"
#include "util/depth.h"
#include "util/ue2_containers.h"
#include <map>
#include <memory>
@ -67,8 +68,12 @@ struct CastleProto {
explicit CastleProto(const PureRepeat &pr);
const CharReach &reach() const;
/** \brief Add a new repeat. */
u32 add(const PureRepeat &pr);
/** \brief Remove a repeat. */
void erase(u32 top);
/**
* \brief Merge in the given repeat, returning the top used.
*
@ -80,11 +85,22 @@ struct CastleProto {
/** \brief Mapping from unique top id to repeat. */
std::map<u32, PureRepeat> repeats;
/** \brief Mapping from report to associated tops. */
ue2::unordered_map<ReportID, flat_set<u32>> report_map;
/**
* \brief Next top id to use. Repeats may be removed without top remapping,
* so we track this explicitly instead of using repeats.size().
*/
u32 next_top = 1;
};
std::set<ReportID> all_reports(const CastleProto &proto);
depth findMinWidth(const CastleProto &proto);
depth findMaxWidth(const CastleProto &proto);
depth findMinWidth(const CastleProto &proto, u32 top);
depth findMaxWidth(const CastleProto &proto, u32 top);
/**
* \brief Remap tops to be contiguous.
@ -133,7 +149,8 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2);
* \brief True if the given castle contains more than a single instance of any
* of the reports in the given set.
*/
bool requiresDedupe(const CastleProto &proto, const std::set<ReportID> &reports);
bool requiresDedupe(const CastleProto &proto,
const ue2::flat_set<ReportID> &reports);
/**
* \brief Build an NGHolder from a CastleProto.

View File

@ -1109,7 +1109,7 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
u32 total_prog_size = byte_length(temp_blocks);
curr_offset += total_prog_size;
gi.stream_som_loc_count = slot_count;
gi.stream_som_loc_count = slot_count;
gi.stream_som_loc_width = somPrecision;
u32 gough_size = ROUNDUP_N(curr_offset, 16);
@ -1136,16 +1136,11 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
gough_dfa->length = gough_size;
/* copy in blocks */
memcpy((u8 *)gough_dfa.get() + edge_prog_offset, &edge_blocks[0],
byte_length(edge_blocks));
copy_bytes((u8 *)gough_dfa.get() + edge_prog_offset, edge_blocks);
if (top_prog_offset) {
memcpy((u8 *)gough_dfa.get() + top_prog_offset, &top_blocks[0],
byte_length(top_blocks));
}
if (!temp_blocks.empty()) {
memcpy((u8 *)gough_dfa.get() + prog_base_offset, &temp_blocks[0],
byte_length(temp_blocks));
copy_bytes((u8 *)gough_dfa.get() + top_prog_offset, top_blocks);
}
copy_bytes((u8 *)gough_dfa.get() + prog_base_offset, temp_blocks);
return gough_dfa;
}

View File

@ -70,8 +70,11 @@ struct dstate_som {
};
struct raw_som_dfa : public raw_dfa {
raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in)
: raw_dfa(k), unordered_som_triggers(unordered_som_triggers_in) {
raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in, u32 trigger,
u32 stream_som_loc_width_in)
: raw_dfa(k), stream_som_loc_width(stream_som_loc_width_in),
unordered_som_triggers(unordered_som_triggers_in),
trigger_nfa_state(trigger) {
assert(!unordered_som_triggers || is_triggered(kind));
}

View File

@ -1397,8 +1397,7 @@ struct Factory {
repeat->horizon = rsi.horizon;
repeat->packedCtrlSize = rsi.packedCtrlSize;
repeat->stateSize = rsi.stateSize;
memcpy(repeat->packedFieldSizes, rsi.packedFieldSizes.data(),
byte_length(rsi.packedFieldSizes));
copy_bytes(repeat->packedFieldSizes, rsi.packedFieldSizes);
repeat->patchCount = rsi.patchCount;
repeat->patchSize = rsi.patchSize;
repeat->encodingSize = rsi.encodingSize;
@ -1413,8 +1412,7 @@ struct Factory {
// Copy in the sparse lookup table.
if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
assert(!rsi.table.empty());
memcpy(info_ptr + tableOffset, rsi.table.data(),
byte_length(rsi.table));
copy_bytes(info_ptr + tableOffset, rsi.table);
}
// Fill the tug mask.
@ -1702,6 +1700,7 @@ struct Factory {
for (u32 i = 0; i < num_repeats; i++) {
repeatOffsets[i] = offset;
assert(repeats[i].first);
memcpy((char *)limex + offset, repeats[i].first.get(),
repeats[i].second);
offset += repeats[i].second;
@ -1709,8 +1708,7 @@ struct Factory {
// Write repeat offset lookup table.
assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
memcpy((char *)limex + repeatOffsetsOffset, repeatOffsets.data(),
byte_length(repeatOffsets));
copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);
limex->repeatOffset = repeatOffsetsOffset;
limex->repeatCount = num_repeats;
@ -1725,8 +1723,7 @@ struct Factory {
limex->exReportOffset = exceptionReportsOffset;
assert(ISALIGNED_N((char *)limex + exceptionReportsOffset,
alignof(ReportID)));
memcpy((char *)limex + exceptionReportsOffset, reports.data(),
byte_length(reports));
copy_bytes((char *)limex + exceptionReportsOffset, reports);
}
static

View File

@ -317,7 +317,7 @@ template<typename limex_type>
struct limex_labeller : public nfa_labeller {
explicit limex_labeller(const limex_type *limex_in) : limex(limex_in) {}
void label_state(FILE *f, u32 state) const {
void label_state(FILE *f, u32 state) const override {
const typename limex_traits<limex_type>::exception_type *exceptions
= getExceptionTable(limex);
if (!testbit((const u8 *)&limex->exceptionMask,

View File

@ -218,7 +218,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) {
DEBUG_PRINTF("using cached succ from previous state\n");
STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), LOAD_STATE(&ctx->cached_esucc)));
if (ctx->cached_reports) {
if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) {
DEBUG_PRINTF("firing cached reports from previous state\n");
if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback,
ctx->context, offset)
== MO_HALT_MATCHING)) {

View File

@ -83,7 +83,8 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ,
if (estate == ctx->cached_estate) {
DEBUG_PRINTF("using cached succ from previous state\n");
*succ |= ctx->cached_esucc;
if (ctx->cached_reports) {
if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) {
DEBUG_PRINTF("firing cached reports from previous state\n");
if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback,
ctx->context, offset)
== MO_HALT_MATCHING)) {
@ -119,7 +120,9 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ,
ctx->cached_reports = new_cache.reports;
ctx->cached_br = new_cache.br;
} else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) {
ctx->cached_estate = 0U;
if (ctx->cached_br) {
ctx->cached_estate = 0U;
}
}
return 0;

View File

@ -179,7 +179,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
assert(ISALIGNED_CL(ctx));
assert(ISALIGNED_CL(&ctx->s));
STATE_T s = LOAD_STATE(&ctx->s);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */
/* assert(ISALIGNED_16(exceptions)); */
/* assert(ISALIGNED_16(reach)); */
@ -305,7 +304,6 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
const ReportID *exReports = getExReports(limex);
const u32 *exceptionMap = limex->exceptionMap;
STATE_T s = LOAD_STATE(&ctx->s);
STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */
/* assert(ISALIGNED_16(exceptions)); */
/* assert(ISALIGNED_16(reach)); */
@ -542,7 +540,6 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
ctx->callback = q->cb;
ctx->context = q->context;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
assert(q->items[q->cur].location >= 0);
DEBUG_PRINTF("LOAD STATE\n");
@ -638,7 +635,6 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
ctx->callback = q->cb;
ctx->context = q->context;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state));
@ -730,7 +726,6 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
ctx->callback = NULL;
ctx->context = NULL;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
DEBUG_PRINTF("LOAD STATE\n");
STORE_STATE(&ctx->s, LOAD_STATE(q->state));
@ -833,7 +828,6 @@ char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset,
ctx->callback = cb;
ctx->context = context;
STORE_STATE(&ctx->cached_estate, ZERO_STATE);
STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
const IMPL_NFA_T *limex = getImplNfa(n);
STORE_STATE(&ctx->s, INITIAL_FN(limex, 0)); // always anchored

View File

@ -700,7 +700,10 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
ReportID arb;
u8 single;
u32 accelCount;
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
u16 count_real_states;
if (allocateFSN16(info, &count_real_states)) {
DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
@ -843,6 +846,7 @@ void fillInBasicState8(const dfa_info &info, mstate_aux *aux, u8 *succ_table,
const vector<u32> &reports_eod, u32 i) {
dstate_id_t j = info.implId(i);
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
for (size_t s = 0; s < info.impl_alpha_size; s++) {
dstate_id_t raw_succ = info.states[i].next[s];

View File

@ -70,9 +70,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
break;
case MPV_VERM:
if (!ourisprint(k->u.verm.c)) {
fprintf(f, "verm 0x%hhu\n", k->u.verm.c);
fprintf(f, "verm 0x%02x\n", k->u.verm.c);
} else {
fprintf(f, "verm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c);
fprintf(f, "verm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c);
}
break;
case MPV_SHUFTI:
@ -87,9 +87,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
break;
case MPV_NVERM:
if (!ourisprint(k->u.verm.c)) {
fprintf(f, "nverm 0x%hhu\n", k->u.verm.c);
fprintf(f, "nverm 0x%02x\n", k->u.verm.c);
} else {
fprintf(f, "nverm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c);
fprintf(f, "nverm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c);
}
break;
default:

View File

@ -196,6 +196,14 @@ static really_inline s64a q_cur_loc(const struct mq *q) {
return q->items[q->cur].location;
}
/** \brief Returns the type of the last event in the queue. */
static really_inline u32 q_last_type(const struct mq *q) {
assert(q->cur < q->end);
assert(q->end > 0);
assert(q->end <= MAX_MQE_LEN);
return q->items[q->end - 1].type;
}
/** \brief Returns the location (relative to the beginning of the current data
* buffer) of the last event in the queue. */
static really_inline s64a q_last_loc(const struct mq *q) {
@ -269,7 +277,7 @@ void debugQueue(const struct mq *q) {
type = "MQE_TOP_N";
break;
}
DEBUG_PRINTF("\tq[%u] %lld %d:%s\n", cur, q->items[cur].location,
DEBUG_PRINTF("\tq[%u] %lld %u:%s\n", cur, q->items[cur].location,
q->items[cur].type, type);
}
}

View File

@ -39,6 +39,8 @@
#include "util/pack_bits.h"
#include "util/partial_store.h"
#include "util/unaligned.h"
#include <stdint.h>
#include <string.h>
/** \brief Returns the total capacity of the ring.
@ -709,12 +711,7 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info,
dumpRing(info, xs, ring);
#endif
// We work in terms of the distance between the current offset and the base
// offset in our history.
u64a delta = offset - xs->offset;
DEBUG_PRINTF("delta=%llu\n", delta);
if (delta < info->repeatMin) {
if (offset - xs->offset < info->repeatMin) {
DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n");
return REPEAT_NOMATCH;
}
@ -724,17 +721,22 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info,
return REPEAT_STALE;
}
// If we're not stale, delta fits in the range [repeatMin, lastTop +
// repeatMax], which fits in a u32.
assert(offset - xs->offset < UINT32_MAX);
u32 delta = (u32)(offset - xs->offset);
DEBUG_PRINTF("delta=%u\n", delta);
// Find the bounds on possible matches in the ring buffer.
u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
u64a upper = delta - info->repeatMin + 1;
upper = MIN(upper, ringOccupancy(xs, ringSize));
u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
u32 upper = MIN(delta - info->repeatMin + 1, ringOccupancy(xs, ringSize));
if (lower >= upper) {
DEBUG_PRINTF("no matches to check\n");
return REPEAT_NOMATCH;
}
DEBUG_PRINTF("possible match indices=[%llu,%llu]\n", lower, upper);
DEBUG_PRINTF("possible match indices=[%u,%u]\n", lower, upper);
if (ringHasMatch(xs, ring, ringSize, lower, upper)) {
return REPEAT_MATCH;
}
@ -1163,7 +1165,7 @@ static
void storeInitialRingTopPatch(const struct RepeatInfo *info,
struct RepeatRingControl *xs,
u8 *state, u64a offset) {
DEBUG_PRINTF("set the first patch\n");
DEBUG_PRINTF("set the first patch, offset=%llu\n", offset);
xs->offset = offset;
u8 *active = state;
@ -1197,12 +1199,10 @@ u32 getSparseOptimalTargetValue(const struct RepeatInfo *info,
return loc;
}
u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
const union RepeatControl *ctrl,
const void *state) {
static
u64a sparseLastTop(const struct RepeatInfo *info,
const struct RepeatRingControl *xs, const u8 *state) {
DEBUG_PRINTF("looking for last top\n");
const struct RepeatRingControl *xs = &ctrl->ring;
u32 patch_size = info->patchSize;
u32 patch_count = info->patchCount;
u32 encoding_size = info->encodingSize;
@ -1214,7 +1214,7 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
}
DEBUG_PRINTF("patch%u encoding_size%u occ%u\n", patch, encoding_size, occ);
const u8 *ring = (const u8 *)state + info->patchesOffset;
const u8 *ring = state + info->patchesOffset;
u64a val = partial_load_u64a(ring + encoding_size * patch, encoding_size);
DEBUG_PRINTF("val:%llu\n", val);
@ -1231,6 +1231,12 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
return 0;
}
u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
const union RepeatControl *ctrl,
const void *state) {
return sparseLastTop(info, &ctrl->ring, state);
}
u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info,
const union RepeatControl *ctrl,
const void *state, u64a offset) {
@ -1249,20 +1255,20 @@ u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info,
if (nextOffset <= xs->offset + info->repeatMin) {
patch = xs->first;
tval = 0;
} else if (nextOffset >
repeatLastTopSparseOptimalP(info, ctrl, state) +
info->repeatMax) {
} else if (nextOffset > sparseLastTop(info, xs, state) + info->repeatMax) {
DEBUG_PRINTF("ring is stale\n");
return 0;
} else {
u64a delta = nextOffset - xs->offset;
u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
assert(nextOffset - xs->offset < UINT32_MAX); // ring is not stale
u32 delta = (u32)(nextOffset - xs->offset);
u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
patch = lower / patch_size;
tval = lower - patch * patch_size;
}
DEBUG_PRINTF("patch %u\n", patch);
u32 patch_count = info->patchCount;
if (patch >= patch_count){
if (patch >= patch_count) {
return 0;
}
@ -1336,21 +1342,32 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
union RepeatControl *ctrl, void *state,
u64a offset, char is_alive) {
struct RepeatRingControl *xs = &ctrl->ring;
u64a delta = offset - xs->offset;
u32 patch_size = info->patchSize;
u32 patch_count = info->patchCount;
u32 encoding_size = info->encodingSize;
u32 patch = delta / patch_size;
DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, encoding_size);
u8 *active = (u8 *)state;
if (!is_alive) {
DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset,
info->encodingSize);
// If (a) this is the first top, or (b) the ring is stale, initialize the
// ring and write this offset in as the first top.
if (!is_alive ||
offset > sparseLastTop(info, xs, state) + info->repeatMax) {
storeInitialRingTopPatch(info, xs, active, offset);
return;
}
assert(offset >= xs->offset);
// Tops should arrive in order, with no duplicates.
assert(offset > sparseLastTop(info, xs, state));
// As the ring is not stale, our delta should fit within a u32.
assert(offset - xs->offset <= UINT32_MAX);
u32 delta = (u32)(offset - xs->offset);
u32 patch_size = info->patchSize;
u32 patch_count = info->patchCount;
u32 encoding_size = info->encodingSize;
u32 patch = delta / patch_size;
DEBUG_PRINTF("delta=%u, patch_size=%u, patch=%u\n", delta, patch_size,
patch);
u8 *ring = active + info->patchesOffset;
u32 occ = ringOccupancy(xs, patch_count);
@ -1361,10 +1378,6 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
patch, patch_count, occ);
if (patch >= patch_count) {
u32 patch_shift_count = patch - patch_count + 1;
if (patch_shift_count >= patch_count) {
storeInitialRingTopPatch(info, xs, active, offset);
return;
}
assert(patch >= patch_shift_count);
DEBUG_PRINTF("shifting by %u\n", patch_shift_count);
xs->offset += patch_size * patch_shift_count;
@ -1401,7 +1414,8 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
}
}
u64a diff = delta - patch * patch_size;
assert((u64a)patch * patch_size <= delta);
u32 diff = delta - patch * patch_size;
const u64a *repeatTable = getImplTable(info);
val += repeatTable[diff];
@ -1480,7 +1494,7 @@ char sparseHasMatch(const struct RepeatInfo *info, const u8 *state,
enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info,
const union RepeatControl *ctrl,
const void *state, u64a offset) {
DEBUG_PRINTF("check for match at %llu corresponding to trigger"
DEBUG_PRINTF("check for match at %llu corresponding to trigger "
"at [%llu, %llu]\n", offset, offset - info->repeatMax,
offset - info->repeatMin);
@ -1492,21 +1506,25 @@ enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info,
if (offset < xs->offset + info->repeatMin) {
DEBUG_PRINTF("too soon\n");
return REPEAT_NOMATCH;
} else if (offset > repeatLastTopSparseOptimalP(info, ctrl, state) +
info->repeatMax) {
} else if (offset > sparseLastTop(info, xs, state) + info->repeatMax) {
DEBUG_PRINTF("stale\n");
return REPEAT_STALE;
}
u64a delta = offset - xs->offset;
u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
u64a upper = delta - info->repeatMin;
// Our delta between the base offset of the ring and the current offset
// must fit within the range [repeatMin, lastPossibleTop + repeatMax]. This
// range fits comfortably within a u32.
assert(offset - xs->offset <= UINT32_MAX);
u32 delta = (u32)(offset - xs->offset);
u32 patch_size = info->patchSize;
u32 patch_count = info->patchCount;
u32 occ = ringOccupancy(xs, patch_count);
upper = MIN(upper, occ * patch_size - 1);
DEBUG_PRINTF("lower=%llu, upper=%llu\n", lower, upper);
u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
u32 upper = MIN(delta - info->repeatMin, occ * patch_size - 1);
DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper);
u32 patch_lower = lower / patch_size;
u32 patch_upper = upper / patch_size;

View File

@ -75,8 +75,8 @@ u32 calcPackedBytes(u64a val) {
}
static
u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
const u32 minPeriod) {
u32 repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
const u32 minPeriod) {
u32 repeatTmp = info->patchCount > 2 ? 64 : (u32)repeatMax;
u32 repeat_index = repeatTmp < minPeriod ? repeatTmp : minPeriod;
for (u32 i = 0; i <= repeat_index; i++) {
@ -93,7 +93,7 @@ u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
static
u32 findOptimalPatchSize(struct RepeatStateInfo *info, const depth &repeatMax,
const u32 minPeriod, u64a rv) {
const u32 minPeriod, u32 rv) {
u32 cnt = 0;
u32 patch_bits = 0;
u32 total_size = 0;
@ -171,7 +171,7 @@ RepeatStateInfo::RepeatStateInfo(enum RepeatType type, const depth &repeatMin,
assert(minPeriod);
assert(repeatMax.is_finite());
{
u64a rv = repeatRecurTable(this, repeatMax, minPeriod);
u32 rv = repeatRecurTable(this, repeatMax, minPeriod);
u32 repeatTmp = 0;
if ((u32)repeatMax < minPeriod) {
repeatTmp = repeatMax;

View File

@ -64,7 +64,7 @@ public:
bool prefilter, const som_type som, ReportID rid, u64a min_offset,
u64a max_offset, u64a min_length);
~NGWrapper();
~NGWrapper() override;
/** index of the expression represented by this graph, used
* - down the track in error handling

View File

@ -55,14 +55,14 @@ namespace ue2 {
namespace {
/** Distance value used to indicate that the vertex can't be reached. */
static const int DIST_UNREACHABLE = INT_MAX;
static constexpr int DIST_UNREACHABLE = INT_MAX;
/**
* Distance value used to indicate that the distance to a vertex is infinite
* (for example, it's the max distance and there's a cycle in the path) or so
* large that we should consider it effectively infinite.
*/
static const int DIST_INFINITY = INT_MAX - 1;
static constexpr int DIST_INFINITY = INT_MAX - 1;
//
// Filters
@ -71,10 +71,12 @@ static const int DIST_INFINITY = INT_MAX - 1;
template <class GraphT>
struct NodeFilter {
typedef typename GraphT::edge_descriptor EdgeT;
NodeFilter() { }
NodeFilter() {} // BGL filters must be default-constructible.
NodeFilter(const vector<bool> *bad_in, const GraphT *g_in)
: bad(bad_in), g(g_in) { }
bool operator()(const EdgeT &e) const {
assert(g && bad);
u32 src_idx = (*g)[source(e, *g)].index;
u32 tar_idx = (*g)[target(e, *g)].index;
@ -84,16 +86,20 @@ struct NodeFilter {
return !(*bad)[src_idx] && !(*bad)[tar_idx];
}
const vector<bool> *bad;
const GraphT *g;
private:
const vector<bool> *bad = nullptr;
const GraphT *g = nullptr;
};
template <class GraphT>
struct StartFilter {
typedef typename GraphT::edge_descriptor EdgeT;
StartFilter() { }
StartFilter() {} // BGL filters must be default-constructible.
explicit StartFilter(const GraphT *g_in) : g(g_in) { }
bool operator()(const EdgeT &e) const {
assert(g);
u32 src_idx = (*g)[source(e, *g)].index;
u32 tar_idx = (*g)[target(e, *g)].index;
@ -107,7 +113,9 @@ struct StartFilter {
}
return true;
}
const GraphT *g;
private:
const GraphT *g = nullptr;
};
} // namespace
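The explicit default constructors matter because boost::filtered_graph wraps its edge iterators in filter_iterators, which may default-construct a copy of the predicate. A minimal illustration outside Hyperscan's own types (KeepAll is a made-up predicate, not part of this change):

    #include <boost/graph/adjacency_list.hpp>
    #include <boost/graph/filtered_graph.hpp>

    struct KeepAll {
        KeepAll() {} // required: BGL may default-construct the predicate
        template <class Edge>
        bool operator()(const Edge &) const { return true; }
    };

    int main() {
        typedef boost::adjacency_list<> Graph;
        Graph g(3);
        add_edge(0, 1, g);
        add_edge(1, 2, g);
        boost::filtered_graph<Graph, KeepAll> fg(g, KeepAll());
        size_t n = 0;
        for (auto p = edges(fg); p.first != p.second; ++p.first) {
            ++n; // both edges pass the trivial filter
        }
        return n == 2 ? 0 : 1;
    }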

View File

@ -125,61 +125,62 @@ void execute_graph_i(const NGHolder &g, const vector<StateInfo> &info,
}
static
void fillStateBitset(const NGHolder &g, const set<NFAVertex> &in,
dynamic_bitset<> &out) {
out.reset();
for (auto v : in) {
dynamic_bitset<> makeStateBitset(const NGHolder &g,
const flat_set<NFAVertex> &in) {
dynamic_bitset<> work_states(num_vertices(g));
for (const auto &v : in) {
u32 idx = g[v].index;
out.set(idx);
work_states.set(idx);
}
return work_states;
}
static
void fillVertexSet(const dynamic_bitset<> &in,
const vector<StateInfo> &info, set<NFAVertex> &out) {
out.clear();
flat_set<NFAVertex> getVertices(const dynamic_bitset<> &in,
const vector<StateInfo> &info) {
flat_set<NFAVertex> out;
for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
out.insert(info[i].vertex);
}
return out;
}
static
void fillInfoTable(const NGHolder &g, vector<StateInfo> &info) {
info.resize(num_vertices(g));
vector<StateInfo> makeInfoTable(const NGHolder &g) {
vector<StateInfo> info(num_vertices(g));
for (auto v : vertices_range(g)) {
u32 idx = g[v].index;
const CharReach &cr = g[v].char_reach;
assert(idx < info.size());
info[idx] = StateInfo(v, cr);
}
return info;
}
void execute_graph(const NGHolder &g, const ue2_literal &input,
set<NFAVertex> *states, bool kill_sds) {
flat_set<NFAVertex> execute_graph(const NGHolder &g, const ue2_literal &input,
const flat_set<NFAVertex> &initial_states,
bool kill_sds) {
assert(hasCorrectlyNumberedVertices(g));
vector<StateInfo> info;
fillInfoTable(g, info);
dynamic_bitset<> work_states(num_vertices(g));
fillStateBitset(g, *states, work_states);
auto info = makeInfoTable(g);
auto work_states = makeStateBitset(g, initial_states);
execute_graph_i(g, info, input, &work_states, kill_sds);
fillVertexSet(work_states, info, *states);
return getVertices(work_states, info);
}
void execute_graph(const NGHolder &g, const vector<CharReach> &input,
set<NFAVertex> *states) {
flat_set<NFAVertex> execute_graph(const NGHolder &g,
const vector<CharReach> &input,
const flat_set<NFAVertex> &initial_states) {
assert(hasCorrectlyNumberedVertices(g));
vector<StateInfo> info;
fillInfoTable(g, info);
dynamic_bitset<> work_states(num_vertices(g));
fillStateBitset(g, *states, work_states);
auto info = makeInfoTable(g);
auto work_states = makeStateBitset(g, initial_states);
execute_graph_i(g, info, input, &work_states, false);
fillVertexSet(work_states, info, *states);
return getVertices(work_states, info);
}
typedef boost::reverse_graph<const NFAGraph, const NFAGraph &> RevNFAGraph;
@ -276,9 +277,10 @@ private:
};
} // namespace
void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
const set<NFAVertex> &input_start_states,
set<NFAVertex> *states) {
flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
const NGHolder &input_dag,
const flat_set<NFAVertex> &input_start_states,
const flat_set<NFAVertex> &initial_states) {
DEBUG_PRINTF("g has %zu vertices, input_dag has %zu vertices\n",
num_vertices(running_g), num_vertices(input_dag));
assert(hasCorrectlyNumberedVertices(running_g));
@ -290,10 +292,8 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
RevNFAGraph revg(input_dag.g);
map<NFAVertex, dynamic_bitset<> > dfs_states;
vector<StateInfo> info;
fillInfoTable(running_g, info);
dynamic_bitset<> input_fs(num_vertices(running_g));
fillStateBitset(running_g, *states, input_fs);
auto info = makeInfoTable(running_g);
auto input_fs = makeStateBitset(running_g, initial_states);
for (auto v : input_start_states) {
dfs_states[v] = input_fs;
@ -303,21 +303,25 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
eg_visitor(running_g, info, input_dag, dfs_states),
make_assoc_property_map(colours));
fillVertexSet(dfs_states[input_dag.accept], info, *states);
auto states = getVertices(dfs_states[input_dag.accept], info);
#ifdef DEBUG
DEBUG_PRINTF(" output rstates:");
for (auto v : *states) {
printf(" %u", running_g[v].index);
}
printf("\n");
DEBUG_PRINTF(" output rstates:");
for (const auto &v : states) {
printf(" %u", running_g[v].index);
}
printf("\n");
#endif
return states;
}
void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
set<NFAVertex> *states) {
set<NFAVertex> input_start_states = {input_dag.start, input_dag.startDs};
execute_graph(running_g, input_dag, input_start_states, states);
flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
const NGHolder &input_dag,
const flat_set<NFAVertex> &initial_states) {
auto input_start_states = {input_dag.start, input_dag.startDs};
return execute_graph(running_g, input_dag, input_start_states,
initial_states);
}
} // namespace ue2

View File

@ -35,8 +35,8 @@
#define NG_EXECUTE_H
#include "ng_holder.h"
#include "util/ue2_containers.h"
#include <set>
#include <vector>
namespace ue2 {
@ -44,23 +44,25 @@ namespace ue2 {
class CharReach;
struct ue2_literal;
void execute_graph(const NGHolder &g, const ue2_literal &input,
std::set<NFAVertex> *states, bool kill_sds = false);
flat_set<NFAVertex> execute_graph(const NGHolder &g, const ue2_literal &input,
const flat_set<NFAVertex> &initial,
bool kill_sds = false);
void execute_graph(const NGHolder &g, const std::vector<CharReach> &input,
std::set<NFAVertex> *states);
flat_set<NFAVertex> execute_graph(const NGHolder &g,
const std::vector<CharReach> &input,
const flat_set<NFAVertex> &initial);
/** Returns the set of states which may still be enabled after receiving an
 * input corresponding to some path through input_dag from start or startDs to
 * accept. input_dag MUST be acyclic aside from self-loops.
 */
void execute_graph(const NGHolder &g, const NGHolder &input_dag,
std::set<NFAVertex> *states);
flat_set<NFAVertex> execute_graph(const NGHolder &g, const NGHolder &input_dag,
const flat_set<NFAVertex> &initial);
/* as above, but able to specify the source states for the input graph */
void execute_graph(const NGHolder &g, const NGHolder &input_dag,
const std::set<NFAVertex> &input_start_states,
std::set<NFAVertex> *states);
flat_set<NFAVertex> execute_graph(const NGHolder &g, const NGHolder &input_dag,
const flat_set<NFAVertex> &input_start_states,
const flat_set<NFAVertex> &initial);
} // namespace ue2
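The execute_graph functions now return the resulting state set instead of updating a set passed by pointer. A minimal sketch of the migrated call pattern, assuming a built and correctly numbered holder g (the seed states and input below are purely illustrative):

    ue2::flat_set<NFAVertex> states;
    for (auto v : vertices_range(g)) {
        states.insert(v);                 // seed with every state for illustration
    }
    std::vector<CharReach> input(1, CharReach('a', 'a'));
    states = execute_graph(g, input, states); // returns the surviving states
    DEBUG_PRINTF("%zu states still on\n", states.size());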

View File

@ -114,7 +114,7 @@ void populateAccepts(const NGHolder &g, StateSet *accept, StateSet *acceptEod) {
}
class Automaton_Base {
public:
protected:
Automaton_Base(const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in)
: graph(graph_in), state_ids(state_ids_in) {
@ -122,6 +122,7 @@ public:
assert(alphasize <= ALPHABET_SIZE);
}
public:
static bool canPrune(const flat_set<ReportID> &) { return false; }
const NGHolder &graph;
@ -608,7 +609,6 @@ bool doHaig(const NGHolder &g,
}
haig_note_starts(g, &rdfa->new_som_nfa_states);
rdfa->trigger_nfa_state = NODE_START;
return true;
}
@ -638,7 +638,8 @@ unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
return nullptr;
}
auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som);
auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som, NODE_START,
somPrecision);
DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
bool rv;
@ -658,7 +659,6 @@ unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
DEBUG_PRINTF("determinised, building impl dfa (a,f) = (%hu,%hu)\n",
rdfa->start_anchored, rdfa->start_floating);
rdfa->stream_som_loc_width = somPrecision;
assert(rdfa->kind == g.kind);
return rdfa;
@ -782,7 +782,9 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
typedef Automaton_Haig_Merge::StateSet StateSet;
vector<StateSet> nfa_state_map;
auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som);
auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som,
NODE_START,
dfas[0]->stream_som_loc_width);
int rv = determinise(n, rdfa->states, limit, &nfa_state_map);
if (rv) {
@ -830,11 +832,9 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
}
haig_merge_note_starts(dfas, per_dfa_adj, &rdfa->new_som_nfa_states);
rdfa->trigger_nfa_state = NODE_START;
DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
rdfa->start_anchored, rdfa->start_floating);
rdfa->stream_som_loc_width = dfas[0]->stream_som_loc_width;
return rdfa;
}

View File

@ -98,8 +98,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin,
info->packedCtrlSize = rsi.packedCtrlSize;
info->horizon = rsi.horizon;
info->minPeriod = minPeriod;
memcpy(&info->packedFieldSizes, rsi.packedFieldSizes.data(),
byte_length(rsi.packedFieldSizes));
copy_bytes(&info->packedFieldSizes, rsi.packedFieldSizes);
info->patchCount = rsi.patchCount;
info->patchSize = rsi.patchSize;
info->encodingSize = rsi.encodingSize;
@ -122,7 +121,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin,
nfa->length = verify_u32(len);
info->length = verify_u32(sizeof(RepeatInfo)
+ sizeof(u64a) * (rsi.patchSize + 1));
memcpy(table, rsi.table.data(), byte_length(rsi.table));
copy_bytes(table, rsi.table);
}
}

View File

@ -316,7 +316,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
bool unbounded = false;
bool exhaustible = can_exhaust(g, rm);
while (a) {
while (true) {
if (is_special(a, g)) {
DEBUG_PRINTF("stopped puffing due to special vertex\n");
break;
@ -350,9 +350,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
a = getSoleSourceVertex(g, a);
if (!a) {
break;
}
assert(a); /* already checked that old a had a proper in degree of 1 */
// Snark: we can't handle this case, because we can only handle a
// single report ID on a vertex

View File

@ -266,7 +266,7 @@ bool validateEXSL(const NGHolder &g,
const vector<CharReach> escapes_vec(1, escapes);
const vector<CharReach> notescapes_vec(1, ~escapes);
set<NFAVertex> states;
ue2::flat_set<NFAVertex> states;
/* turn on all states past the prefix */
DEBUG_PRINTF("region %u is cutover\n", region);
for (auto v : vertices_range(g)) {
@ -276,20 +276,20 @@ bool validateEXSL(const NGHolder &g,
}
/* process the escapes */
execute_graph(g, escapes_vec, &states);
states = execute_graph(g, escapes_vec, states);
/* flood with any number of not escapes */
set<NFAVertex> prev_states;
ue2::flat_set<NFAVertex> prev_states;
while (prev_states != states) {
prev_states = states;
execute_graph(g, notescapes_vec, &states);
states = execute_graph(g, notescapes_vec, states);
insert(&states, prev_states);
}
/* find the input start states to use when running the prefix through, as
 * when the escape character arrives we may already be partway through
 * matching the prefix */
set<NFAVertex> prefix_start_states;
ue2::flat_set<NFAVertex> prefix_start_states;
for (auto v : vertices_range(prefix)) {
if (v != prefix.accept && v != prefix.acceptEod
/* and as we have already made it past the prefix once */
@ -298,11 +298,12 @@ bool validateEXSL(const NGHolder &g,
}
}
execute_graph(prefix, escapes_vec, &prefix_start_states);
prefix_start_states =
execute_graph(prefix, escapes_vec, prefix_start_states);
assert(contains(prefix_start_states, prefix.startDs));
/* see what happens after we feed it the prefix */
execute_graph(g, prefix, prefix_start_states, &states);
states = execute_graph(g, prefix, prefix_start_states, states);
for (auto v : states) {
assert(v != g.accept && v != g.acceptEod); /* no cr -> should never be

View File

@ -136,7 +136,7 @@ bool firstMatchIsFirst(const NGHolder &p) {
return false;
}
set<NFAVertex> states;
ue2::flat_set<NFAVertex> states;
/* turn on all states (except starts - avoid suffix matches) */
/* If we were doing (1) we would also exclude states leading to accepts -
avoid prefix matches */
@ -149,7 +149,7 @@ bool firstMatchIsFirst(const NGHolder &p) {
}
/* run the prefix through the main graph */
execute_graph(p, p, &states);
states = execute_graph(p, p, states);
for (auto v : states) {
/* need to check if this vertex may represent an infix match - ie
@ -313,7 +313,7 @@ bool sentClearsTail(const NGHolder &g,
*/
u32 first_bad_region = ~0U;
set<NFAVertex> states;
ue2::flat_set<NFAVertex> states;
/* turn on all states */
DEBUG_PRINTF("region %u is cutover\n", last_head_region);
for (auto v : vertices_range(g)) {
@ -327,7 +327,7 @@ bool sentClearsTail(const NGHolder &g,
}
/* run the prefix through the main graph */
execute_graph(g, sent, &states);
states = execute_graph(g, sent, states);
/* .. and check if we are left with anything in the tail region */
for (auto v : states) {

View File

@ -51,10 +51,16 @@ namespace ue2 {
namespace {
/** Filter out edges from start-to-start or accept-to-accept. */
/**
* Filter out special edges, or in the top-specific variant, start edges that
* don't have the right top set.
*/
struct SpecialEdgeFilter {
SpecialEdgeFilter() {}
explicit SpecialEdgeFilter(const NGHolder *h_in) : h(h_in) {}
explicit SpecialEdgeFilter(const NGHolder &h_in) : h(&h_in) {}
explicit SpecialEdgeFilter(const NGHolder &h_in, u32 top_in)
: h(&h_in), single_top(true), top(top_in) {}
bool operator()(const NFAEdge &e) const {
const NFAGraph &g = h->g;
NFAVertex u = source(e, g), v = target(e, g);
@ -62,23 +68,33 @@ struct SpecialEdgeFilter {
(is_any_accept(u, g) && is_any_accept(v, g))) {
return false;
}
if (single_top) {
if (u == h->start && g[e].top != top) {
return false;
}
if (u == h->startDs) {
return false;
}
}
return true;
}
private:
const NGHolder *h = nullptr;
bool single_top = false;
u32 top = 0;
};
} // namespace
static
depth findMinWidth(const NGHolder &h, NFAVertex src) {
depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
NFAVertex src) {
if (isLeafNode(src, h)) {
return depth::unreachable();
}
typedef boost::filtered_graph<NFAGraph, SpecialEdgeFilter> StartGraph;
StartGraph g(h.g, SpecialEdgeFilter(&h));
boost::filtered_graph<NFAGraph, SpecialEdgeFilter> g(h.g, filter);
assert(hasCorrectlyNumberedVertices(h));
const size_t num = num_vertices(h);
@ -112,7 +128,8 @@ depth findMinWidth(const NGHolder &h, NFAVertex src) {
}
static
depth findMaxWidth(const NGHolder &h, NFAVertex src) {
depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
NFAVertex src) {
if (isLeafNode(src, h.g)) {
return depth::unreachable();
}
@ -122,8 +139,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
return depth::infinity();
}
typedef boost::filtered_graph<NFAGraph, SpecialEdgeFilter> NodeFilteredGraph;
NodeFilteredGraph g(h.g, SpecialEdgeFilter(&h));
boost::filtered_graph<NFAGraph, SpecialEdgeFilter> g(h.g, filter);
assert(hasCorrectlyNumberedVertices(h));
const size_t num = num_vertices(h);
@ -164,7 +180,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
if (d.is_unreachable()) {
// If we're actually reachable, we'll have a min width, so we can
// return infinity in this case.
if (findMinWidth(h, src).is_reachable()) {
if (findMinWidth(h, filter, src).is_reachable()) {
return depth::infinity();
}
return d;
@ -175,11 +191,10 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
return d - depth(1);
}
/** Returns the minimum width in bytes of an input that will match the given
* graph. */
depth findMinWidth(const NGHolder &h) {
depth startDepth = findMinWidth(h, h.start);
depth dotstarDepth = findMinWidth(h, h.startDs);
static
depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter) {
depth startDepth = findMinWidth(h, filter, h.start);
depth dotstarDepth = findMinWidth(h, filter, h.startDs);
DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(),
dotstarDepth.str().c_str());
if (startDepth.is_unreachable()) {
@ -194,11 +209,18 @@ depth findMinWidth(const NGHolder &h) {
}
}
/** Returns the maximum width in bytes of an input that will match the given
* graph. If there is no maximum width, returns infinity. */
depth findMaxWidth(const NGHolder &h) {
depth startDepth = findMaxWidth(h, h.start);
depth dotstarDepth = findMaxWidth(h, h.startDs);
depth findMinWidth(const NGHolder &h) {
return findMinWidth(h, SpecialEdgeFilter(h));
}
depth findMinWidth(const NGHolder &h, u32 top) {
return findMinWidth(h, SpecialEdgeFilter(h, top));
}
static
depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter) {
depth startDepth = findMaxWidth(h, filter, h.start);
depth dotstarDepth = findMaxWidth(h, filter, h.startDs);
DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(),
dotstarDepth.str().c_str());
if (startDepth.is_unreachable()) {
@ -210,4 +232,12 @@ depth findMaxWidth(const NGHolder &h) {
}
}
depth findMaxWidth(const NGHolder &h) {
return findMaxWidth(h, SpecialEdgeFilter(h));
}
depth findMaxWidth(const NGHolder &h, u32 top) {
return findMaxWidth(h, SpecialEdgeFilter(h, top));
}
} // namespace ue2

View File

@ -41,14 +41,34 @@ namespace ue2 {
class NGHolder;
/** Returns the minimum width in bytes of an input that will match the given
* graph. */
/**
* \brief Compute the minimum width in bytes of an input that will match the
* given graph.
*/
depth findMinWidth(const NGHolder &h);
/** Returns the maximum width in bytes of an input that will match the given
* graph. If there is no maximum width, returns infinity. */
/**
* \brief Compute the minimum width in bytes of an input that will match the
* given graph, considering only paths activated by the given top.
*/
depth findMinWidth(const NGHolder &h, u32 top);
/**
* \brief Compute the maximum width in bytes of an input that will match the
* given graph.
*
* If there is no bound on the maximum width, returns infinity.
*/
depth findMaxWidth(const NGHolder &h);
/**
* \brief Compute the maximum width in bytes of an input that will match the
* given graph, considering only paths activated by the given top.
*
* If there is no bound on the maximum width, returns infinity.
*/
depth findMaxWidth(const NGHolder &h, u32 top);
} // namespace ue2
#endif // NG_WIDTH_H
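The new per-top overloads limit the width computation to paths that can be triggered by a single top on the start vertex, which is how the Rose suffix width queries later in this change use them. A small hedged sketch (the helper name is an illustration, not part of the change):

    static void dumpWidthsForTop(const NGHolder &h, u32 top) {
        depth lo = findMinWidth(h, top);   // only edges from start carrying this top
        depth hi = findMaxWidth(h, top);   // infinity() if unbounded
        DEBUG_PRINTF("top %u: width in [%s, %s]\n", top, lo.str().c_str(),
                     hi.str().c_str());
    }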

View File

@ -52,7 +52,8 @@ AsciiComponentClass *AsciiComponentClass::clone() const {
}
bool AsciiComponentClass::class_empty(void) const {
return cr.none() && cr_ucp.none();
assert(finalized);
return cr.none();
}
void AsciiComponentClass::createRange(unichar to) {
@ -60,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) {
unsigned char from = (u8)range_start;
if (from > to) {
throw LocatedParseError("Range out of order in character class");
} else {
in_cand_range = false;
cr.setRange(from, to);
range_start = INVALID_UNICODE;
}
in_cand_range = false;
CharReach ncr(from, to);
if (mode.caseless) {
make_caseless(&ncr);
}
cr |= ncr;
range_start = INVALID_UNICODE;
}
void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
@ -94,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) {
c = translateForUcpMode(c, mode);
}
// Note: caselessness is handled by getPredefinedCharReach.
CharReach pcr = getPredefinedCharReach(c, mode);
if (negative) {
pcr.flip();
}
if (isUcp(c)) {
cr_ucp |= pcr;
} else {
cr |= pcr;
}
cr |= pcr;
range_start = INVALID_UNICODE;
in_cand_range = false;
}
@ -119,7 +121,12 @@ void AsciiComponentClass::add(unichar c) {
return;
}
cr.set(c);
CharReach ncr(c, c);
if (mode.caseless) {
make_caseless(&ncr);
}
cr |= ncr;
range_start = c;
}
@ -135,12 +142,6 @@ void AsciiComponentClass::finalize() {
in_cand_range = false;
}
if (mode.caseless) {
make_caseless(&cr);
}
cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
if (m_negate) {
cr.flip();
}

View File

@ -78,12 +78,10 @@ protected:
private:
Position position;
CharReach cr;
CharReach cr_ucp;
// Private copy ctor. Use clone instead.
AsciiComponentClass(const AsciiComponentClass &other)
: ComponentClass(other), position(other.position), cr(other.cr),
cr_ucp(other.cr_ucp) {}
: ComponentClass(other), position(other.position), cr(other.cr) {}
};
} // namespace ue2

View File

@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
case CLASS_DIGIT:
return number;
case CLASS_GRAPH:
case CLASS_XGRAPH:
return CharReach(0x21, 0x7e);
case CLASS_XGRAPH:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_HORZ:
return CharReach("\x09\x20\xA0");
case CLASS_LOWER:
@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
}
case CLASS_PRINT:
return CharReach(0x20, 0x7e);
case CLASS_XPRINT:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_PUNCT:
return CharReach(0x21, '0' - 1)
| CharReach('9' + 1, 'A' - 1)
| CharReach('Z' + 1, 'a' - 1)
| CharReach('z' + 1, 126);
case CLASS_XPUNCT:
return to_cr(getPredefinedCodePointSet(c, mode));
case CLASS_SPACE:
return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
case CLASS_UPPER:
@ -420,7 +425,7 @@ unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
ComponentClass::ComponentClass(const ParseMode &mode_in)
: m_negate(false), mode(mode_in), in_cand_range(false),
range_start(INVALID_UNICODE), finalized(false), firstChar('\0') {}
range_start(INVALID_UNICODE), finalized(false) {}
ComponentClass::~ComponentClass() { }
@ -441,7 +446,6 @@ void ComponentClass::addDash(void) {
}
void ComponentClass::negate() {
assert(class_empty());
m_negate = true;
}

View File

@ -63,7 +63,9 @@ enum PredefinedClass {
CLASS_VERT,
CLASS_WORD,
CLASS_XDIGIT,
CLASS_XGRAPH,
CLASS_XGRAPH, /* [:graph:] in UCP mode */
CLASS_XPRINT, /* [:print:] in UCP mode */
CLASS_XPUNCT, /* [:punct:] in UCP mode */
CLASS_UCP_C,
CLASS_UCP_CC,
CLASS_UCP_CF,
@ -232,8 +234,12 @@ public:
Component *accept(ComponentVisitor &v) override = 0;
void accept(ConstComponentVisitor &v) const override = 0;
/** True iff we have already started adding members to the class. This is
* a different concept to Component::empty */
/** \brief True if the class contains no members (i.e. it will not match
* against anything). This function can only be called on a finalized
* class.
*
* Note: This is a different concept to Component::empty.
*/
virtual bool class_empty(void) const = 0;
virtual void add(PredefinedClass c, bool negated) = 0;
@ -245,9 +251,6 @@ public:
bool isNegated() const { return m_negate; }
void setFirstChar(char c) { firstChar = c; }
char getFirstChar() const { return firstChar; }
std::vector<PositionInfo> first() const override = 0;
std::vector<PositionInfo> last() const override = 0;
bool empty() const override { return false; } /* always 1 codepoint wide */
@ -263,19 +266,13 @@ protected:
unichar range_start;
bool finalized;
/** Literal character at the start of this character class, e.g. '.' for
* the class [.abc]. Used to identify (unsupported) POSIX collating
* elements. */
char firstChar;
virtual void createRange(unichar) = 0;
// Protected copy ctor. Use clone instead.
ComponentClass(const ComponentClass &other)
: Component(other), m_negate(other.m_negate), mode(other.mode),
in_cand_range(other.in_cand_range), range_start(other.range_start),
finalized(other.finalized),
firstChar(other.firstChar) {}
finalized(other.finalized) {}
};
} // namespace ue2

View File

@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
assert(!inCharClass); // not reentrant
currentCls = getComponentClass(mode);
inCharClass = true;
inCharClassEarly = true;
currentClsBegin = ts;
fgoto readClass;
}
@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
}
action is_utf8 { mode.utf8 }
action is_ignore_space { mode.ignore_space }
action is_early_charclass { inCharClassEarly }
action addNumberedBackRef {
if (accumulator == 0) {
@ -790,10 +792,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
any => { throw LocatedParseError("Unknown property"); };
*|;
charClassGuts := |*
# We don't like POSIX collating elements (neither does PCRE or Perl).
'\[\.' [^\]]* '\.\]' |
'\[=' [^\]]* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating element");
# We don't support POSIX collating elements (neither does PCRE
# or Perl). These look like [.ch.] or [=ch=].
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
'\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
};
# Named sets
# Adding these may cause the charclass to close, hence the
@ -889,11 +893,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throw LocatedParseError("Invalid POSIX named class");
};
'\\Q' => {
// fcall readQuotedClass;
ostringstream str;
str << "\\Q..\\E sequences in character classes not supported at index "
<< ts - ptr << ".";
throw ParseError(str.str());
fcall readQuotedClass;
};
'\\E' => { /*noop*/};
# Backspace (this is only valid for \b in char classes)
@ -1090,28 +1090,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throwInvalidUtf8();
};
# dot or equals at the end of a character class could be the end
# of a collating element, like [.blah.] or [=blah=].
[.=] ']' => {
if (currentCls->getFirstChar() == *ts) {
assert(currentClsBegin);
ostringstream oss;
oss << "Unsupported POSIX collating element at index "
<< currentClsBegin - ptr << ".";
throw ParseError(oss.str());
}
currentCls->add(*ts);
currentCls->finalize();
currentSeq->addComponent(move(currentCls));
inCharClass = false;
fgoto main;
};
# Literal character
(any - ']') => {
if (currentCls->class_empty()) {
currentCls->setFirstChar(*ts);
}
currentCls->add(*ts);
};
@ -1127,35 +1107,35 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
# Parser to read stuff from a character class
#############################################################
readClass := |*
# the negate and right bracket out the front are special
'\^' => {
# A caret at the beginning of the class means that the rest of the
# class is negated.
'\^' when is_early_charclass => {
if (currentCls->isNegated()) {
// Already seen a caret; the second one is not a meta-character.
inCharClassEarly = false;
fhold; fgoto charClassGuts;
} else {
currentCls->negate();
// Note: we cannot switch off inCharClassEarly here, as /[^]]/
// needs to use the right square bracket path below.
}
};
']' => {
// if this is the first thing in the class, add it and move along,
// otherwise jump into the char class machine to handle what might
// end up as fail
if (currentCls->class_empty()) {
currentCls->add(']');
} else {
// leave it for the next machine
fhold;
}
fgoto charClassGuts;
# A right square bracket before anything "real" is interpreted as a
# literal right square bracket.
']' when is_early_charclass => {
currentCls->add(']');
inCharClassEarly = false;
};
# if we hit a quote before anything "real", handle it
#'\\Q' => { fcall readQuotedClass; };
'\\Q' => {
throw LocatedParseError("\\Q..\\E sequences in character classes not supported");
};
'\\Q' => { fcall readQuotedClass; };
'\\E' => { /*noop*/};
# time for the real work to happen
any => { fhold; fgoto charClassGuts; };
any => {
inCharClassEarly = false;
fhold;
fgoto charClassGuts;
};
*|;
#############################################################
@ -1183,6 +1163,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
# Literal character
any => {
currentCls->add(*ts);
inCharClassEarly = false;
};
*|;
@ -1232,6 +1213,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
throw LocatedParseError("POSIX named classes are only "
"supported inside a class");
};
# We don't support POSIX collating elements (neither does PCRE
# or Perl). These look like [.ch.] or [=ch=].
'\[\.' ( '\\]' | [^\]] )* '\.\]' |
'\[=' ( '\\]' | [^\]] )* '=\]' => {
throw LocatedParseError("Unsupported POSIX collating "
"element");
};
# Begin eating characters for class
'\[' => eatClass;
# Begin quoted literal
@ -1896,6 +1884,11 @@ unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) {
// brackets [..].
bool inCharClass = false;
// True if the machine is inside a character class but it has not processed
// any "real" elements yet, i.e. it's still processing meta-characters like
// '^'.
bool inCharClassEarly = false;
// Location at which the current character class began.
const u8 *currentClsBegin = p;

View File

@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
} else {
return CLASS_UCP_LL;
}
case CLASS_PRINT:
return CLASS_XPRINT;
case CLASS_PUNCT:
return CLASS_XPUNCT;
case CLASS_SPACE:
return CLASS_UCP_XPS;
case CLASS_UPPER:
@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
}
}
static
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode) {
/* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
@ -117,6 +120,22 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
rv |= cf;
return rv;
}
case CLASS_XPRINT: {
// Same as graph, plus everything with the Zs property.
CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
rv |= getUcpZs();
rv.set(0x180e); // Also included in this class by PCRE 8.38.
return rv;
}
case CLASS_XPUNCT: {
// Everything with the P (punctuation) property, plus code points in S
// (symbols) that are < 128.
CodePointSet rv = getUcpP();
CodePointSet symbols = getUcpS();
symbols.unsetRange(128, MAX_UNICODE);
rv |= symbols;
return rv;
}
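// Illustrative consequence of the rule above (not part of the change):
// '+' (U+002B, category Sm) is included because it is a symbol below 128,
// while U+00B1 PLUS-MINUS SIGN (also Sm) is excluded because it is >= 128;
// punctuation above 127, such as U+00A1 INVERTED EXCLAMATION MARK (Po),
// remains included via the P set.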
case CLASS_HORZ: {
CodePointSet rv;
rv.set(0x0009); /* Horizontal tab */
@ -484,7 +503,8 @@ UTF8ComponentClass *UTF8ComponentClass::clone() const {
}
bool UTF8ComponentClass::class_empty(void) const {
return cps.none() && cps_ucp.none();
assert(finalized);
return cps.none();
}
void UTF8ComponentClass::createRange(unichar to) {
@ -492,16 +512,16 @@ void UTF8ComponentClass::createRange(unichar to) {
unichar from = range_start;
if (from > to) {
throw LocatedParseError("Range out of order in character class");
} else {
in_cand_range = false;
CodePointSet ncps;
ncps.setRange(from, to);
if (mode.caseless) {
make_caseless(&ncps);
}
cps |= ncps;
range_start = INVALID_UNICODE;
}
in_cand_range = false;
CodePointSet ncps;
ncps.setRange(from, to);
if (mode.caseless) {
make_caseless(&ncps);
}
cps |= ncps;
range_start = INVALID_UNICODE;
}
void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
@ -520,11 +540,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
pcps.flip();
}
if (isUcp(c)) {
cps_ucp |= pcps;
} else {
cps |= pcps;
}
cps |= pcps;
range_start = INVALID_UNICODE;
in_cand_range = false;
@ -562,8 +578,6 @@ void UTF8ComponentClass::finalize() {
in_cand_range = false;
}
cps |= cps_ucp; /* characters from ucp props always case sensitive */
if (m_negate) {
cps.flip();
}
@ -571,31 +585,6 @@ void UTF8ComponentClass::finalize() {
finalized = true;
}
bool isUcp(PredefinedClass c) {
switch (c) {
case CLASS_ALNUM:
case CLASS_ALPHA:
case CLASS_ANY:
case CLASS_ASCII:
case CLASS_BLANK:
case CLASS_CNTRL:
case CLASS_DIGIT:
case CLASS_GRAPH:
case CLASS_HORZ:
case CLASS_LOWER:
case CLASS_PRINT:
case CLASS_PUNCT:
case CLASS_SPACE:
case CLASS_UPPER:
case CLASS_VERT:
case CLASS_WORD:
case CLASS_XDIGIT:
return false;
default:
return true;
}
}
Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
map<u8, Position>::const_iterator it = heads.find(first_byte);
if (it != heads.end()) {

View File

@ -93,7 +93,6 @@ private:
void buildFourByte(GlushkovBuildState &bs);
CodePointSet cps;
CodePointSet cps_ucp;
std::map<u8, Position> heads;
Position single_pos;
@ -108,7 +107,9 @@ private:
};
PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
bool isUcp(PredefinedClass c);
CodePointSet getPredefinedCodePointSet(PredefinedClass c,
const ParseMode &mode);
} // namespace

View File

@ -57,7 +57,7 @@ public:
ReferenceVisitor(size_t num_groups, const flat_set<string> &targets)
: num_ids(num_groups), names(targets) {}
~ReferenceVisitor();
~ReferenceVisitor() override;
void invalid_index(const char *component, unsigned id) {
assert(component);

View File

@ -201,7 +201,7 @@ const ComponentSequence *findCapturingGroup(const Component *root,
class PrefilterVisitor : public DefaultComponentVisitor {
public:
PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
~PrefilterVisitor();
~PrefilterVisitor() override;
/** \brief Calls the visitor (recursively) on a new replacement component
* we've just created. Takes care of freeing it if the sequence is itself

View File

@ -64,7 +64,7 @@ namespace ue2 {
*/
class ConstructLiteralVisitor : public ConstComponentVisitor {
public:
~ConstructLiteralVisitor();
~ConstructLiteralVisitor() override;
/** \brief Thrown if this component does not represent a literal. */
struct NotLiteral {};

View File

@ -44,7 +44,7 @@ namespace ue2 {
* an unsupported component. */
class UnsupportedVisitor : public DefaultConstComponentVisitor {
public:
~UnsupportedVisitor();
~UnsupportedVisitor() override;
void pre(const ComponentAssertion &) override {
throw ParseError("Zero-width assertions are not supported.");
}

View File

@ -379,7 +379,7 @@ void ensureEnd(struct mq *q, UNUSED u32 qi, s64a final_loc) {
DEBUG_PRINTF("ensure MQE_END %lld for queue %u\n", final_loc, qi);
if (final_loc >= q_last_loc(q)) {
/* TODO: ensure situation does not arise */
assert(q->items[q->end - 1].type != MQE_END);
assert(q_last_type(q) != MQE_END);
pushQueueNoMerge(q, MQE_END, final_loc);
}
}

View File

@ -758,7 +758,7 @@ found_miracle:
q_skip_forward_to(q, miracle_loc);
if (q->items[q->end - 1].type == MQE_START) {
if (q_last_type(q) == MQE_START) {
DEBUG_PRINTF("miracle caused infix to die\n");
return 0;
}
@ -853,7 +853,7 @@ char roseTestLeftfix(const struct RoseEngine *t, const struct RoseRole *tr,
}
}
if (q_cur_loc(q) < loc || q->items[q->end - 1].type != MQE_START) {
if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) {
if (left->infix) {
if (infixTooOld(q, loc)) {
DEBUG_PRINTF("infix %u died of old age\n", ri);

View File

@ -42,6 +42,7 @@
#include "rose_in_graph.h"
#include "util/alloc.h"
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "util/ue2string.h"
#include <memory>
@ -72,8 +73,8 @@ public:
/** \brief True if we cannot establish that at most a single callback will
* be generated at a given offset from this set of reports. */
virtual bool requiresDedupeSupport(const std::set<ReportID> &reports) const
= 0;
virtual bool requiresDedupeSupport(const ue2::flat_set<ReportID> &reports)
const = 0;
};
/** \brief Abstract interface intended for callers from elsewhere in the tree,

View File

@ -271,16 +271,13 @@ public:
typedef Holder_StateSet StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
explicit Automaton_Holder(const NGHolder &g_in) : g(g_in), bad(false) {
explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) {
for (auto v : vertices_range(g)) {
vertexToIndex[v] = indexToVertex.size();
indexToVertex.push_back(v);
}
if (indexToVertex.size() > ANCHORED_NFA_STATE_LIMIT) {
bad = true;
return;
}
assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT);
DEBUG_PRINTF("%zu states\n", indexToVertex.size());
init.wdelay = 0;
@ -400,7 +397,6 @@ public:
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
bool bad;
};
} // namespace
@ -670,13 +666,13 @@ int finalise_out(RoseBuildImpl &tbi, const NGHolder &h,
static
int addAutomaton(RoseBuildImpl &tbi, const NGHolder &h, ReportID *remap) {
Automaton_Holder autom(h);
if (autom.bad) {
if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) {
DEBUG_PRINTF("autom bad!\n");
return ANCHORED_FAIL;
}
Automaton_Holder autom(h);
unique_ptr<raw_dfa> out_dfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX);
if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) {
return finalise_out(tbi, h, autom, move(out_dfa), remap);
@ -738,7 +734,6 @@ void buildSimpleDfas(const RoseBuildImpl &tbi,
NGHolder h;
populate_holder(simple.first, exit_ids, &h);
Automaton_Holder autom(h);
assert(!autom.bad);
unique_ptr<raw_dfa> rdfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX);
UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES);
assert(!rv);

View File

@ -2687,12 +2687,6 @@ void fillInReportInfo(RoseEngine *engine, u32 reportOffset,
sizeof(internal_report));
}
static
void populateInvDkeyTable(char *ptr, const ReportManager &rm) {
vector<ReportID> table = rm.getDkeyToReportTable();
memcpy(ptr, table.data(), byte_length(table));
}
static
bool hasSimpleReports(const vector<Report> &reports) {
auto it = find_if(reports.begin(), reports.end(), isComplexReport);
@ -4154,7 +4148,7 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
engine->ekeyCount = rm.numEkeys();
engine->dkeyCount = rm.numDkeys();
engine->invDkeyOffset = dkeyOffset;
populateInvDkeyTable(ptr + dkeyOffset, rm);
copy_bytes(ptr + dkeyOffset, rm.getDkeyToReportTable());
engine->somHorizon = ssm.somPrecision();
engine->somLocationCount = ssm.numSomSlots();
@ -4314,33 +4308,22 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
buildLitBenefits(*this, engine.get(), base_lits_benefits_offset);
// Copy in other tables
memcpy(ptr + bc.engine_blob_base, bc.engine_blob.data(),
byte_length(bc.engine_blob));
memcpy(ptr + engine->literalOffset, literalTable.data(),
byte_length(literalTable));
memcpy(ptr + engine->roleOffset, bc.roleTable.data(),
byte_length(bc.roleTable));
copy(leftInfoTable.begin(), leftInfoTable.end(),
(LeftNfaInfo *)(ptr + engine->leftOffset));
copy_bytes(ptr + bc.engine_blob_base, bc.engine_blob);
copy_bytes(ptr + engine->literalOffset, literalTable);
copy_bytes(ptr + engine->roleOffset, bc.roleTable);
copy_bytes(ptr + engine->leftOffset, leftInfoTable);
fillLookaroundTables(ptr + lookaroundTableOffset,
ptr + lookaroundReachOffset, bc.lookaround);
fillInSomRevNfas(engine.get(), ssm, rev_nfa_table_offset, rev_nfa_offsets);
memcpy(ptr + engine->predOffset, predTable.data(), byte_length(predTable));
memcpy(ptr + engine->rootRoleOffset, rootRoleTable.data(),
byte_length(rootRoleTable));
memcpy(ptr + engine->anchoredReportMapOffset, art.data(), byte_length(art));
memcpy(ptr + engine->anchoredReportInverseMapOffset, arit.data(),
byte_length(arit));
memcpy(ptr + engine->multidirectOffset, mdr_reports.data(),
byte_length(mdr_reports));
copy(activeLeftIter.begin(), activeLeftIter.end(),
(mmbit_sparse_iter *)(ptr + engine->activeLeftIterOffset));
memcpy(ptr + engine->sideOffset, sideTable.data(), byte_length(sideTable));
copy_bytes(ptr + engine->predOffset, predTable);
copy_bytes(ptr + engine->rootRoleOffset, rootRoleTable);
copy_bytes(ptr + engine->anchoredReportMapOffset, art);
copy_bytes(ptr + engine->anchoredReportInverseMapOffset, arit);
copy_bytes(ptr + engine->multidirectOffset, mdr_reports);
copy_bytes(ptr + engine->activeLeftIterOffset, activeLeftIter);
copy_bytes(ptr + engine->sideOffset, sideTable);
DEBUG_PRINTF("rose done %p\n", engine.get());
return engine;

View File

@ -1631,20 +1631,23 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
assert(left.graph());
const NGHolder &h = *left.graph();
ue2::flat_set<NFAVertex> all_states;
insert(&all_states, vertices(h));
assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */
DEBUG_PRINTF("removing sds\n");
all_states.erase(h.startDs);
ue2::flat_set<NFAVertex> states;
/* check each pred literal to see if they all kill previous graph
* state */
for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) {
const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id);
const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s);
set<NFAVertex> states;
insert(&states, vertices(h));
assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */
DEBUG_PRINTF("removing sds\n");
states.erase(h.startDs);
DEBUG_PRINTF("running graph %zu\n", states.size());
execute_graph(h, s, &states, true);
DEBUG_PRINTF("ran\n");
states = execute_graph(h, s, all_states, true);
DEBUG_PRINTF("ran, %zu states on\n", states.size());
if (!states.empty()) {
return false;

View File

@ -130,6 +130,8 @@ private:
friend depth findMinWidth(const suffix_id &s);
friend depth findMaxWidth(const suffix_id &s);
friend depth findMinWidth(const suffix_id &s, u32 top);
friend depth findMaxWidth(const suffix_id &s, u32 top);
};
std::set<ReportID> all_reports(const suffix_id &s);
@ -138,6 +140,8 @@ bool has_eod_accepts(const suffix_id &s);
bool has_non_eod_accepts(const suffix_id &s);
depth findMinWidth(const suffix_id &s);
depth findMaxWidth(const suffix_id &s);
depth findMinWidth(const suffix_id &s, u32 top);
depth findMaxWidth(const suffix_id &s, u32 top);
size_t hash_value(const suffix_id &s);
/** \brief represents an engine to the left of a rose role */

View File

@ -77,6 +77,8 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in,
hasSom(false),
group_weak_end(0),
group_end(0),
anchored_base_id(MO_INVALID_IDX),
nonbenefits_base_id(MO_INVALID_IDX),
ematcher_region_size(0),
floating_direct_report(false),
eod_event_literal_id(MO_INVALID_IDX),
@ -536,7 +538,7 @@ u32 RoseBuildImpl::getNewLiteralId() {
}
static
bool requiresDedupe(const NGHolder &h, const set<ReportID> &reports,
bool requiresDedupe(const NGHolder &h, const ue2::flat_set<ReportID> &reports,
const Grey &grey) {
/* TODO: tighten */
NFAVertex seen_vert = NFAGraph::null_vertex();
@ -579,13 +581,14 @@ bool requiresDedupe(const NGHolder &h, const set<ReportID> &reports,
class RoseDedupeAuxImpl : public RoseDedupeAux {
public:
explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in);
bool requiresDedupeSupport(const set<ReportID> &reports) const override;
bool requiresDedupeSupport(
const ue2::flat_set<ReportID> &reports) const override;
const RoseBuildImpl &tbi;
map<ReportID, set<RoseVertex> > vert_map;
map<ReportID, set<suffix_id> > suffix_map;
map<ReportID, set<const OutfixInfo *> > outfix_map;
map<ReportID, set<const raw_puff *> > puff_map;
map<ReportID, set<RoseVertex>> vert_map;
map<ReportID, set<suffix_id>> suffix_map;
map<ReportID, set<const OutfixInfo *>> outfix_map;
map<ReportID, set<const raw_puff *>> puff_map;
};
unique_ptr<RoseDedupeAux> RoseBuildImpl::generateDedupeAux() const {
@ -599,6 +602,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
: tbi(tbi_in) {
const RoseGraph &g = tbi.g;
set<suffix_id> suffixes;
for (auto v : vertices_range(g)) {
// Literals in the small block table don't count as dupes: although
// they have copies in the anchored table, the two are never run in the
@ -609,10 +614,16 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
}
}
// Several vertices may share a suffix, so we collect the set of
// suffixes first to avoid repeating work.
if (g[v].suffix) {
for (const auto &report_id : all_reports(g[v].suffix)) {
suffix_map[report_id].insert(g[v].suffix);
}
suffixes.insert(g[v].suffix);
}
}
for (const auto &suffix : suffixes) {
for (const auto &report_id : all_reports(suffix)) {
suffix_map[report_id].insert(suffix);
}
}
@ -634,8 +645,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
}
}
bool RoseDedupeAuxImpl::requiresDedupeSupport(const set<ReportID> &reports)
const {
bool RoseDedupeAuxImpl::requiresDedupeSupport(
const ue2::flat_set<ReportID> &reports) const {
/* TODO: this could be expanded to check for offset or character
constraints */
@ -897,6 +908,17 @@ depth findMinWidth(const suffix_id &s) {
}
}
depth findMinWidth(const suffix_id &s, u32 top) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMinWidth(*s.graph(), top);
} else if (s.castle()) {
return findMinWidth(*s.castle(), top);
} else {
return s.dfa_min_width;
}
}
depth findMaxWidth(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
@ -908,6 +930,17 @@ depth findMaxWidth(const suffix_id &s) {
}
}
depth findMaxWidth(const suffix_id &s, u32 top) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMaxWidth(*s.graph(), top);
} else if (s.castle()) {
return findMaxWidth(*s.castle(), top);
} else {
return s.dfa_max_width;
}
}
bool has_eod_accepts(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {

View File

@ -439,12 +439,16 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) {
hash_combine(val, hash_range(begin(props.reports), end(props.reports)));
if (props.suffix) {
hash_combine(val, all_reports(props.suffix));
if (props.suffix.graph) {
hash_combine(val, num_vertices(*props.suffix.graph));
const auto &suffix = props.suffix;
if (suffix.castle) {
hash_combine(val, suffix.castle->reach());
hash_combine(val, suffix.castle->repeats.size());
}
if (props.suffix.haig) {
hash_combine(val, hash_dfa(*props.suffix.haig));
if (suffix.graph) {
hash_combine(val, num_vertices(*suffix.graph));
}
if (suffix.haig) {
hash_combine(val, hash_dfa(*suffix.haig));
}
}
@ -747,14 +751,17 @@ void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr<NGHolder> h,
* Castle. */
static
void pruneCastle(CastleProto &castle, ReportID report) {
for (map<u32, PureRepeat>::iterator it = castle.repeats.begin();
it != castle.repeats.end(); /* incr inside */) {
if (contains(it->second.reports, report)) {
++it;
} else {
castle.repeats.erase(it++);
unordered_set<u32> dead; // tops to remove.
for (const auto &m : castle.repeats) {
if (!contains(m.second.reports, report)) {
dead.insert(m.first);
}
}
for (const auto &top : dead) {
castle.erase(top);
}
assert(!castle.repeats.empty());
}
@ -794,7 +801,7 @@ void pruneUnusedTops(CastleProto &castle, const RoseGraph &g,
for (u32 top : assoc_keys(castle.repeats)) {
if (!contains(used_tops, top)) {
DEBUG_PRINTF("removing unused top %u\n", top);
castle.repeats.erase(top);
castle.erase(top);
}
}
}

View File

@ -94,10 +94,11 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) {
}
if (g[v].suffix) {
depth suffix_width = findMinWidth(g[v].suffix);
depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top);
assert(suffix_width.is_reachable());
DEBUG_PRINTF("%zu has suffix (width %s), can fire report at %u\n",
g[v].idx, suffix_width.str().c_str(),
DEBUG_PRINTF("%zu has suffix with top %u (width %s), can fire "
"report at %u\n",
g[v].idx, g[v].suffix.top, suffix_width.str().c_str(),
w + suffix_width);
minWidth = min(minWidth, w + suffix_width);
}
@ -146,8 +147,9 @@ u32 findMaxBAWidth(const RoseBuildImpl &tbi) {
if (has_non_eod_accepts(g[v].suffix)) {
return ROSE_BOUND_INF;
}
depth suffix_width = findMaxWidth(g[v].suffix);
DEBUG_PRINTF("suffix max width %s\n", suffix_width.str().c_str());
depth suffix_width = findMaxWidth(g[v].suffix, g[v].suffix.top);
DEBUG_PRINTF("suffix max width for top %u is %s\n", g[v].suffix.top,
suffix_width.str().c_str());
assert(suffix_width.is_reachable());
if (!suffix_width.is_finite()) {
DEBUG_PRINTF("suffix too wide\n");

View File

@ -167,7 +167,7 @@ found_miracle:
DEBUG_PRINTF("skip q forward, %lld to %lld\n", begin_loc, miracle_loc);
q_skip_forward_to(q, miracle_loc);
if (q->items[q->end - 1].type == MQE_START) {
if (q_last_type(q) == MQE_START) {
DEBUG_PRINTF("miracle caused infix to die\n");
return MIRACLE_DEAD;
}

View File

@ -98,18 +98,22 @@ u64a theirtoupper64(const u64a x) {
static really_inline
int cmpNocaseNaive(const u8 *p1, const u8 *p2, size_t len) {
const u8 *pEnd = (const u8 *)p1 + len;
for (; p1 < pEnd; p1++, p2++)
if (mytolower(*p1) != mytolower(*p2))
for (; p1 < pEnd; p1++, p2++) {
if (mytolower(*p1) != mytolower(*p2)) {
return 1;
}
}
return 0;
}
static really_inline
int cmpCaseNaive(const u8 *p1, const u8 *p2, size_t len) {
const u8 *pEnd = (const u8 *)p1 + len;
for (; p1 < pEnd; p1++, p2++)
if (*p1 != *p2)
for (; p1 < pEnd; p1++, p2++) {
if (*p1 != *p2) {
return 1;
}
}
return 0;
}

View File

@ -33,8 +33,13 @@
#ifndef UTIL_CONTAINER_H
#define UTIL_CONTAINER_H
#include "ue2common.h"
#include <algorithm>
#include <cassert>
#include <cstring>
#include <set>
#include <type_traits>
#include <utility>
namespace ue2 {
@ -92,11 +97,35 @@ std::set<typename C::key_type> assoc_keys(const C &container) {
return keys;
}
/**
* \brief Return the length in bytes of the given vector of (POD) objects.
*/
template<typename T>
typename std::vector<T>::size_type byte_length(const std::vector<T> &vec) {
static_assert(std::is_pod<T>::value, "should be pod");
return vec.size() * sizeof(T);
}
/**
* \brief Copy the given vector of POD objects to the given location in memory.
* It is safe to give this function an empty vector.
*/
template<typename T>
void *copy_bytes(void *dest, const std::vector<T> &vec) {
static_assert(std::is_pod<T>::value, "should be pod");
assert(dest);
// Since we're generally using this function to write into the bytecode,
// dest should be appropriately aligned for T.
assert(ISALIGNED_N(dest, alignof(T)));
if (vec.empty()) {
return dest; // Protect memcpy against null pointers.
}
assert(vec.data() != nullptr);
return std::memcpy(dest, vec.data(), byte_length(vec));
}
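/* These helpers replace the explicit memcpy/byte_length pairs used when
 * writing POD tables into the bytecode (see the Rose build changes below).
 * Illustrative usage, with made-up table contents:
 *
 *   std::vector<u32> table = {1, 2, 3};
 *   assert(byte_length(table) == 3 * sizeof(u32));
 *   std::vector<u32> dest(table.size());
 *   copy_bytes(dest.data(), table);    // plain memcpy, but safe if table is empty
 */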
template<typename OrderedContainer1, typename OrderedContainer2>
bool is_subset_of(const OrderedContainer1 &small, const OrderedContainer2 &big) {
static_assert(std::is_same<typename OrderedContainer1::value_type,

View File

@ -183,7 +183,7 @@ public:
s64a rv = val + d;
if (rv < 0 || (u64a)rv >= val_infinity) {
DEBUG_PRINTF("depth %llu too large to represent!\n", rv);
DEBUG_PRINTF("depth %lld too large to represent!\n", rv);
throw DepthOverflowError();
}

View File

@ -142,23 +142,25 @@ const u32 mmbit_root_offset_from_level[7] = {
u32 mmbit_size(u32 total_bits) {
MDEBUG_PRINTF("%u\n", total_bits);
// UE-2228: multibit has bugs in very, very large cases; compile-time resource
// limits should prevent us from ever hitting them.
assert(total_bits <= 1U << 30);
// Flat model multibit structures are just stored as a bit vector.
if (total_bits <= MMB_FLAT_MAX_BITS) {
return ROUNDUP_N(total_bits, 8) / 8;
}
u32 current_level = 1;
u32 total = 0;
u64a current_level = 1; // Number of blocks on current level.
u64a total = 0; // Total number of blocks.
while (current_level * MMB_KEY_BITS < total_bits) {
total += current_level;
current_level <<= MMB_KEY_SHIFT;
}
total += (total_bits + MMB_KEY_BITS - 1)/MMB_KEY_BITS;
return sizeof(MMB_TYPE) * total;
// Last level is a one-for-one bit vector. It needs room for total_bits
// elements, rounded up to the nearest block.
u64a last_level = ((u64a)total_bits + MMB_KEY_BITS - 1) / MMB_KEY_BITS;
total += last_level;
assert(total * sizeof(MMB_TYPE) <= UINT32_MAX);
return (u32)(total * sizeof(MMB_TYPE));
}
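/* Illustrative sketch of the same computation, assuming the usual multibit
 * constants of 64-bit blocks (MMB_KEY_BITS == 64, MMB_KEY_SHIFT == 6). Not
 * part of this change; for exposition only.
 *
 *   static u32 mmbit_size_sketch(u32 total_bits) {
 *       if (total_bits <= MMB_FLAT_MAX_BITS) {
 *           return ROUNDUP_N(total_bits, 8) / 8;  // flat model: plain bit vector
 *       }
 *       u64a interior = 0, width = 1;             // width = blocks on this level
 *       while (width * 64 < total_bits) {
 *           interior += width;
 *           width <<= 6;                          // each block summarises 64 children
 *       }
 *       u64a last = ((u64a)total_bits + 63) / 64; // one-for-one bits, rounded up
 *       return (u32)((interior + last) * sizeof(u64a));
 *   }
 */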
#ifdef DUMP_SUPPORT

View File

@ -235,18 +235,18 @@ const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) {
/** \brief get the block for this key on the current level as a u8 ptr */
static really_inline
u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
return mmbit_get_level_root(bits, level) +
(key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) *
sizeof(MMB_TYPE);
u8 *level_root = mmbit_get_level_root(bits, level);
u32 ks = mmbit_get_ks(max_level, level);
return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
}
/** \brief get the block for this key on the current level as a const u8 ptr */
static really_inline
const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level,
u32 key) {
return mmbit_get_level_root_const(bits, level) +
(key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) *
sizeof(MMB_TYPE);
const u8 *level_root = mmbit_get_level_root_const(bits, level);
u32 ks = mmbit_get_ks(max_level, level);
return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
}
/** \brief get the _byte_ for this key on the current level as a u8 ptr */
@ -254,7 +254,7 @@ static really_inline
u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
u8 *level_root = mmbit_get_level_root(bits, level);
u32 ks = mmbit_get_ks(max_level, level);
return level_root + (key >> (ks + MMB_KEY_SHIFT - 3));
return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3));
}
/** \brief get our key value for the current level */
@ -721,11 +721,11 @@ u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin,
}
static really_inline
MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max,
u32 block_base) {
MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max,
u64a block_base) {
const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT;
u32 lshift = (block_min - block_base) >> level_shift;
u32 ushift = (block_max - block_base) >> level_shift;
u64a lshift = (block_min - block_base) >> level_shift;
u64a ushift = (block_max - block_base) >> level_shift;
MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0;
MMB_TYPE umask =
ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES;
@ -734,7 +734,7 @@ MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max,
static really_inline
u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) {
u32 key = 0;
u64a key = 0;
u32 ks = mmbit_keyshift(total_bits);
const u32 max_level = mmbit_maxlevel_from_keyshift(ks);
u32 level = 0;
@ -743,9 +743,9 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32
assert(level <= max_level);
u32 block_width = MMB_KEY_BITS << ks;
u32 block_base = key*block_width;
u32 block_min = MAX(it_start, block_base);
u32 block_max = MIN(it_end, block_base + block_width - 1);
u64a block_base = key * block_width;
u64a block_min = MAX(it_start, block_base);
u64a block_max = MIN(it_end, block_base + block_width - 1);
const u8 *block_ptr =
mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE);
MMB_TYPE block = mmb_load(block_ptr);
@ -761,13 +761,14 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32
// No bit found, go up a level
// we know that this block didn't have any answers, so we can push
// our start iterator forward.
it_start = block_base + block_width;
if (it_start > it_end) {
u64a next_start = block_base + block_width;
if (next_start > it_end) {
break;
}
if (level-- == 0) {
break;
}
it_start = next_start;
key >>= MMB_KEY_SHIFT;
ks += MMB_KEY_SHIFT;
}

View File

@ -128,11 +128,9 @@ vector<ReportID> ReportManager::getDkeyToReportTable() const {
}
void ReportManager::assignDkeys(const RoseBuild *rose) {
unique_ptr<RoseDedupeAux> dedupe = rose->generateDedupeAux();
DEBUG_PRINTF("assigning...\n");
map<u32, set<ReportID>> ext_to_int;
map<u32, ue2::flat_set<ReportID>> ext_to_int;
for (u32 i = 0; i < reportIds.size(); i++) {
const Report &ir = reportIds[i];
@ -143,6 +141,8 @@ void ReportManager::assignDkeys(const RoseBuild *rose) {
}
}
auto dedupe = rose->generateDedupeAux();
for (const auto &m : ext_to_int) {
u32 ext = m.first;

View File

@ -7,7 +7,8 @@ if(NOT XCODE)
else()
set(CMAKE_CXX_FLAGS "-isystem ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CXX_FLAGS}")
endif()
include_directories(${CMAKE_SOURCE_DIR}/util)
include_directories(${PROJECT_SOURCE_DIR})
# remove some warnings
# cmake's scope means these only apply here
@ -26,7 +27,7 @@ endif()
add_library(gtest ${gtest_SOURCES})
add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${CMAKE_SOURCE_DIR})
add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR})
if (NOT RELEASE_BUILD)
set(unit_internal_SOURCES

View File

@ -85,7 +85,6 @@
84:/[=\]=]/ #Unsupported POSIX collating element at index 0.
85:/A(?!)+Z/ #Invalid repeat at index 5.
86:/\X/ #\X unsupported at index 0.
87:/[a\Qz\E]/ #\Q..\E sequences in character classes not supported at index 2.
88:/[A-\d]/ #Invalid range in character class at index 3.
89:/[A-[:digit:]]/ #Invalid range in character class at index 3.
90:/B[--[:digit:]--]+/ #Invalid range in character class at index 4.
@ -128,3 +127,8 @@
128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
129:/bignum \1111111111111111111/ #Number is too big at index 7.
130:/foo|&{5555555,}/ #Bounded repeat is too large.
131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.
135:/[^\D\d]/8W #Pattern can never match.
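
Case 87 leaves this expected-failure list while cases 131-135 join it: \Q...\E quoting inside a character class is now expected to compile, whereas POSIX collating elements inside a class remain rejected. A hedged sketch of probing both constructs through the public hs_compile() API (the exact error text and index are version-dependent and not quoted from the library):

```cpp
// Sketch: compile the two constructs through the public API and report the
// outcome. Link against libhs; error messages vary by Hyperscan version.
#include <hs.h>
#include <cstdio>

static void try_pattern(const char *pat) {
    hs_database_t *db = nullptr;
    hs_compile_error_t *err = nullptr;
    if (hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &err) == HS_SUCCESS) {
        std::printf("%s: compiled\n", pat);
        hs_free_database(db);
    } else {
        std::printf("%s: rejected (%s)\n", pat, err ? err->message : "?");
        hs_free_compile_error(err);
    }
}

int main() {
    try_pattern("[a\\Qz\\E]"); // no longer in the failure list (old case 87)
    try_pattern("[a[.ch.]]");  // collating element in a class: still rejected
}
```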

View File

@ -363,7 +363,9 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) {
ASSERT_TRUE(ba != nullptr);
// Set one bit on and run some checks.
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
mmbit_clear(ba, test_size);
mmbit_set(ba, test_size, i);
@ -381,7 +383,12 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) {
// Scanning from one past our bit to the end should find nothing.
if (i != test_size - 1) {
ASSERT_EQ(MMB_INVALID, mmbit_iterate_bounded(ba, test_size, i + 1, test_size));
// Ordinary iterator.
ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, i));
// Bounded iterator.
ASSERT_EQ(MMB_INVALID,
mmbit_iterate_bounded(ba, test_size, i + 1, test_size));
}
}
}
@ -393,7 +400,7 @@ TEST_P(MultiBitTest, BoundedIteratorAll) {
// Switch everything on.
fill_mmbit(ba, test_size);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
if (i != 0) {
ASSERT_EQ(0U, mmbit_iterate_bounded(ba, test_size, 0, i));
}
@ -408,13 +415,13 @@ TEST_P(MultiBitTest, BoundedIteratorEven) {
// Set every even-numbered bit and see what we can see.
mmbit_clear(ba, test_size);
for (u32 i = 0; i < test_size; i += 2) {
for (u64a i = 0; i < test_size; i += 2) {
mmbit_set(ba, test_size, i);
}
u32 even_stride = stride % 2 ? stride + 1 : stride;
for (u32 i = 0; i < test_size; i += even_stride) {
for (u64a i = 0; i < test_size; i += even_stride) {
// Scanning from each even bit to the end should find itself.
ASSERT_EQ(i, mmbit_iterate_bounded(ba, test_size, i, test_size));
@ -439,13 +446,13 @@ TEST_P(MultiBitTest, BoundedIteratorOdd) {
// Set every odd-numbered bit and see what we can see.
mmbit_clear(ba, test_size);
for (u32 i = 1; i < test_size; i += 2) {
for (u64a i = 1; i < test_size; i += 2) {
mmbit_set(ba, test_size, i);
}
u32 even_stride = stride % 2 ? stride + 1 : stride;
for (u32 i = 0; i < test_size; i += even_stride) {
for (u64a i = 0; i < test_size; i += even_stride) {
// Scanning from each even bit to the end should find i+1.
if (i+1 < test_size) {
ASSERT_EQ(i+1, mmbit_iterate_bounded(ba, test_size, i, test_size));
@ -473,7 +480,7 @@ TEST_P(MultiBitTest, Set) {
mmbit_clear(ba, test_size);
ASSERT_FALSE(mmbit_any(ba, test_size));
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
// set a bit that wasn't set before
@ -500,7 +507,7 @@ TEST_P(MultiBitTest, Iter) {
mmbit_clear(ba, test_size);
ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID));
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
mmbit_clear(ba, test_size);
mmbit_set(ba, test_size, i);
@ -517,13 +524,13 @@ TEST_P(MultiBitTest, IterAll) {
ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID));
// Set all bits.
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
mmbit_set(ba, test_size, i);
}
// Find all bits.
u32 it = MMB_INVALID;
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
ASSERT_EQ(i, mmbit_iterate(ba, test_size, it));
it = i;
}
@ -536,7 +543,7 @@ TEST_P(MultiBitTest, AnyPrecise) {
mmbit_clear(ba, test_size);
ASSERT_FALSE(mmbit_any_precise(ba, test_size));
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
mmbit_clear(ba, test_size);
mmbit_set(ba, test_size, i);
@ -551,7 +558,7 @@ TEST_P(MultiBitTest, Any) {
mmbit_clear(ba, test_size);
ASSERT_FALSE(mmbit_any(ba, test_size));
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
mmbit_clear(ba, test_size);
mmbit_set(ba, test_size, i);
@ -567,7 +574,7 @@ TEST_P(MultiBitTest, UnsetRange1) {
fill_mmbit(ba, test_size);
// Use mmbit_unset_range to switch off any single bit.
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
ASSERT_TRUE(mmbit_isset(ba, test_size, i));
mmbit_unset_range(ba, test_size, i, i + 1);
@ -590,7 +597,7 @@ TEST_P(MultiBitTest, UnsetRange2) {
// Use mmbit_unset_range to switch off all bits.
mmbit_unset_range(ba, test_size, 0, test_size);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
ASSERT_FALSE(mmbit_isset(ba, test_size, i));
}
@ -601,12 +608,12 @@ TEST_P(MultiBitTest, UnsetRange3) {
ASSERT_TRUE(ba != nullptr);
// Use mmbit_unset_range to switch off bits in chunks of 3.
for (u32 i = 0; i < test_size - 3; i += stride) {
for (u64a i = 0; i < test_size - 3; i += stride) {
// Switch on the bit before, the bits in question, and the bit after.
if (i > 0) {
mmbit_set(ba, test_size, i - 1);
}
for (u32 j = i; j < min(i + 4, test_size); j++) {
for (u64a j = i; j < min(i + 4, (u64a)test_size); j++) {
mmbit_set(ba, test_size, j);
}
@ -635,7 +642,7 @@ TEST_P(MultiBitTest, InitRangeAll) {
mmbit_init_range(ba, test_size, 0, test_size);
// Make sure they're all set.
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
SCOPED_TRACE(i);
ASSERT_TRUE(mmbit_isset(ba, test_size, i));
}
@ -656,7 +663,7 @@ TEST_P(MultiBitTest, InitRangeOne) {
SCOPED_TRACE(test_size);
ASSERT_TRUE(ba != nullptr);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
mmbit_init_range(ba, test_size, i, i + 1);
// Only bit 'i' should be on.
@ -685,7 +692,7 @@ TEST_P(MultiBitTest, InitRangeChunked) {
ASSERT_EQ(chunk_begin, mmbit_iterate(ba, test_size, MMB_INVALID));
// All bits in the chunk should be on.
for (u32 i = chunk_begin; i < chunk_end; i += stride) {
for (u64a i = chunk_begin; i < chunk_end; i += stride) {
SCOPED_TRACE(i);
ASSERT_TRUE(mmbit_isset(ba, test_size, i));
}
@ -985,7 +992,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginAll) {
vector<mmbit_sparse_iter> it;
vector<u32> bits;
bits.reserve(test_size / stride);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
bits.push_back(i);
}
mmbBuildSparseIterator(it, bits, test_size);
@ -1032,7 +1039,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) {
// Switch every third bit on in state
mmbit_clear(ba, test_size);
ASSERT_FALSE(mmbit_any(ba, test_size));
for (u32 i = 0; i < test_size; i += 3) {
for (u64a i = 0; i < test_size; i += 3) {
mmbit_set(ba, test_size, i);
}
@ -1044,7 +1051,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) {
ASSERT_EQ(0U, val);
ASSERT_EQ(0U, idx);
for (u32 i = 0; i < test_size - 3; i += 3) {
for (u64a i = 0; i < test_size - 3; i += 3) {
mmbit_unset(ba, test_size, i);
val = mmbit_sparse_iter_begin(ba, test_size, &idx, &it[0], &state[0]);
ASSERT_EQ(i+3, val);
@ -1060,7 +1067,7 @@ TEST_P(MultiBitTest, SparseIteratorNextAll) {
vector<mmbit_sparse_iter> it;
vector<u32> bits;
bits.reserve(test_size / stride);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
bits.push_back(i);
}
mmbBuildSparseIterator(it, bits, test_size);
@ -1103,7 +1110,7 @@ TEST_P(MultiBitTest, SparseIteratorNextExactStrided) {
vector<mmbit_sparse_iter> it;
vector<u32> bits;
bits.reserve(test_size / stride);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
bits.push_back(i);
mmbit_set(ba, test_size, i);
}
@ -1135,7 +1142,7 @@ TEST_P(MultiBitTest, SparseIteratorNextNone) {
vector<mmbit_sparse_iter> it;
vector<u32> bits;
bits.reserve(test_size / stride);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
bits.push_back(i);
}
mmbBuildSparseIterator(it, bits, test_size);
@ -1164,7 +1171,7 @@ TEST_P(MultiBitTest, SparseIteratorUnsetAll) {
vector<mmbit_sparse_iter> it;
vector<u32> bits;
bits.reserve(test_size / stride);
for (u32 i = 0; i < test_size; i += stride) {
for (u64a i = 0; i < test_size; i += stride) {
bits.push_back(i);
}
mmbBuildSparseIterator(it, bits, test_size);
@ -1194,10 +1201,10 @@ TEST_P(MultiBitTest, SparseIteratorUnsetHalves) {
// Two sparse iterators: one for even bits, one for odd ones
vector<u32> even, odd;
for (u32 i = 0; i < test_size; i += 2) {
for (u64a i = 0; i < test_size; i += 2) {
even.push_back(i);
}
for (u32 i = 1; i < test_size; i += 2) {
for (u64a i = 1; i < test_size; i += 2) {
odd.push_back(i);
}
@ -1277,9 +1284,9 @@ static const MultiBitTestParam multibitTests[] = {
{ 1U << 28, 15073 },
{ 1U << 29, 24413 },
{ 1U << 30, 50377 },
{ 1U << 31, 104729 },
// XXX: cases this large segfault in mmbit_set, FIXME NOW
//{ 1U << 31, 3701 },
// { UINT32_MAX, 104729 }, // Very slow
};
INSTANTIATE_TEST_CASE_P(MultiBit, MultiBitTest, ValuesIn(multibitTests));

View File

@ -36,9 +36,9 @@
#include "nfagraph/ng_builder.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_asserts.h"
#include "util/target_info.h"
#include "hs_compile.h"
#include "ng_find_matches.h"
#include "util/ng_find_matches.h"
#include "util/target_info.h"
using namespace std;
using namespace testing;

View File

@ -448,6 +448,25 @@ TEST_P(RepeatTest, Pack) {
}
}
TEST_P(RepeatTest, LargeGap) {
SCOPED_TRACE(testing::Message() << "Repeat: " << info);
if (info.repeatMax == REPEAT_INF) {
return; // Test not valid for FIRST-type repeats.
}
for (int i = 0; i < 64; i++) {
u64a top1 = 1000;
repeatStore(&info, ctrl, state, top1, 0); // first top
ASSERT_EQ(top1, repeatLastTop(&info, ctrl, state));
// Add a second top after a gap of 2^i bytes.
u64a top2 = top1 + (1ULL << i);
repeatStore(&info, ctrl, state, top2, 1); // second top
ASSERT_EQ(top2, repeatLastTop(&info, ctrl, state));
}
}
static
const u32 sparsePeriods[] = {
2,
@ -505,6 +524,7 @@ const RepeatTestInfo sparseRepeats[] = {
{ REPEAT_SPARSE_OPTIMAL_P, 4000, 4000 },
{ REPEAT_SPARSE_OPTIMAL_P, 4500, 4500 },
{ REPEAT_SPARSE_OPTIMAL_P, 5000, 5000 },
{ REPEAT_SPARSE_OPTIMAL_P, 65534, 65534 },
// {N, M} repeats
{ REPEAT_SPARSE_OPTIMAL_P, 10, 20 },
{ REPEAT_SPARSE_OPTIMAL_P, 20, 40 },
@ -528,7 +548,8 @@ const RepeatTestInfo sparseRepeats[] = {
{ REPEAT_SPARSE_OPTIMAL_P, 3500, 4000 },
{ REPEAT_SPARSE_OPTIMAL_P, 4000, 8000 },
{ REPEAT_SPARSE_OPTIMAL_P, 4500, 8000 },
{ REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 }
{ REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 },
{ REPEAT_SPARSE_OPTIMAL_P, 60000, 65534 }
};
static
@ -802,7 +823,7 @@ TEST_P(SparseOptimalTest, Simple1) {
1000 + info->repeatMax * 2));
ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state,
1000 + info->repeatMax * 2 + 1));
ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 10000));
ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 100000));
}
TEST_P(SparseOptimalTest, TwoTopsNeg) {
@ -893,6 +914,24 @@ TEST_P(SparseOptimalTest, Simple3e) {
test_sparse3entryExpire(info, ctrl, state, 2 * info->minPeriod - 1);
}
TEST_P(SparseOptimalTest, LargeGap) {
SCOPED_TRACE(testing::Message() << "Repeat: " << *info);
for (int i = 0; i < 64; i++) {
u64a top1 = 1000;
repeatStore(info, ctrl, state, top1, 0); // first top
ASSERT_EQ(top1, repeatLastTop(info, ctrl, state));
// Add a second top after a gap of 2^i bytes.
u64a top2 = top1 + (1ULL << i);
if (top2 - top1 < info->minPeriod) {
continue; // not a valid top
}
repeatStore(info, ctrl, state, top2, 1); // second top
ASSERT_EQ(top2, repeatLastTop(info, ctrl, state));
}
}
TEST_P(SparseOptimalTest, ThreeTops) {
SCOPED_TRACE(testing::Message() << "Repeat: " << *info);