Merge branch develop into master

2025-08-16 07:56:09 +03:00 · 2015-12-18 14:41:50 +11:00 · 2015-12-18 14:41:50 +11:00 · 0e5c4cbd1d
commit 0e5c4cbd1d
parent fe31630221 a5944067d4
72 changed files with 1021 additions and 872 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,36 @@
 # Hyperscan Change Log
 This is a list of notable changes to Hyperscan, in reverse chronological order.
 ## [4.1.0] 2015-12-18
 - Update version of PCRE used by testing tools as a syntax and semantic
  reference to PCRE 8.38.
 - Small updates to fix warnings identified by Coverity.
 - Clean up and unify exception handling behaviour across GPR and SIMD NFA
  models.
 - Fix bug in handling of bounded repeat triggers with large gaps between them
  for sparse repeat model.
 - Correctly reject POSIX collating elements (`[.ch.]`, `[=ch=]`) in the parser.
  These are not supported by Hyperscan.
 - Add support for quoted sequences (`\Q...\E`) inside character classes.
 - Simplify FDR literal matcher runtime by removing some static specialization.
 - Fix handling of the POSIX `[:graph:]`, `[:print:]` and `[:punct:]` character
  classes to match the behaviour of PCRE 8.38 in both standard operation and
  with the UCP flag set. (Note: some bugs were fixed in this area in PCRE
  8.38.) Previously Hyperscan's behaviour was the same as versions of PCRE
  before 8.34.
 - Improve performance when compiling pattern sets that include a large number
  of similar bounded repeat constructs. (github issue #9)
 ## [4.0.1] 2015-10-30
 - Minor cleanups to test code.
 - CMake and other build system improvements.
 - API update: allow `hs_reset_stream()` and `hs_reset_and_copy_stream()` to be
  supplied with a NULL scratch pointer if no matches are required. This is in
  line with the behaviour of `hs_close_stream()`.
 - Disallow bounded repeats with a very large minimum repeat but no maximum,
  i.e. `{N,}` for very large N.
 - Reduce compile memory usage in literal set expansion for some large cases.
 ## [4.0.0] 2015-10-20
 - Original release of Hyperscan as open-source software.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 2.8.11)
 project (Hyperscan C CXX)
 set (HS_MAJOR_VERSION 4)
-set (HS_MINOR_VERSION 0)
+set (HS_MINOR_VERSION 1)
-set (HS_PATCH_VERSION 1)
+set (HS_PATCH_VERSION 0)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
 string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
-set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 INCLUDE (CheckFunctionExists)
@ -56,8 +56,9 @@ if(CMAKE_GENERATOR STREQUAL Xcode)
    set(XCODE TRUE)
 endif()
-include_directories(src .)
+set(CMAKE_INCLUDE_CURRENT_DIR 1)
-include_directories(${CMAKE_BINARY_DIR})
+include_directories(${PROJECT_SOURCE_DIR}/src)
 include_directories(${PROJECT_BINARY_DIR})
 include_directories(SYSTEM include)
 set(BOOST_USE_STATIC_LIBS OFF)
@ -71,7 +72,7 @@ find_package(Boost ${BOOST_MINVERSION})
 if(NOT Boost_FOUND)
    # we might have boost in tree, so provide a hint and try again
    message(STATUS "trying include dir for boost")
-    set(BOOST_INCLUDEDIR "${CMAKE_SOURCE_DIR}/include")
+    set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
    find_package(Boost ${BOOST_MINVERSION})
    if(NOT Boost_FOUND)
        message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
@ -219,6 +220,15 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
 CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
 CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
 if (RELEASE_BUILD)
    if (HAS_C_HIDDEN)
        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
    endif()
    if (HAS_CXX_HIDDEN)
        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden")
    endif()
 endif()
 # testing a builtin takes a little more work
 CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
 CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
@ -327,8 +337,8 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/tools)
 endif()
 # do substitutions
-configure_file(${CMAKE_MODULE_PATH}/config.h.in ${CMAKE_BINARY_DIR}/config.h)
+configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
-configure_file(src/hs_version.h.in hs_version.h)
+configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
 if (PKG_CONFIG_FOUND)
    # we really only need to do this if we have pkg-config
@ -345,7 +355,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
 # include the autogen targets
 add_subdirectory(src/fdr)
-include_directories(${CMAKE_BINARY_DIR}/src/fdr)
+include_directories(${PROJECT_BINARY_DIR}/src/fdr)
 if(NOT WIN32)
 set(RAGEL_C_FLAGS "-Wno-unused")
--- a/README.md
+++ b/README.md
@ -20,3 +20,24 @@ the [Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/)
 Hyperscan is licensed under the BSD License. See the LICENSE file in the
 project repository.
 # Versioning
 The `master` branch on Github will always contain the most recent release of
 Hyperscan. Each version released to `master` goes through QA and testing before
 it is released; if you're a user, rather than a developer, this is the version
 you should be using.
 Further development towards the next release takes place on the `develop`
 branch.
 # Get Involved
 The official homepage for Hyperscan is at [01.org/hyperscan](https://01.org/hyperscan).
 If you have questions or comments, we encourage you to [join the mailing
 list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by
 sending email to the list, or by creating an issue on Github.
 If you wish to contact the Hyperscan team at Intel directly, without posting
 publicly to the mailing list, send email to
 [hyperscan@intel.com](mailto:hyperscan@intel.com).
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@ -63,6 +63,9 @@ described at <http://www.pcre.org/>. However, not all constructs available in
 libpcre are supported. The use of unsupported constructs will result in
 compilation errors.
 The version of PCRE used to validate Hyperscan's interpretation of this syntax
 is 8.38.
 ====================
 Supported Constructs
 ====================
--- a/examples/simplegrep.c
+++ b/examples/simplegrep.c
@ -109,7 +109,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
     * limit the size of our buffer appropriately. */
    if ((unsigned long)dataLen > UINT_MAX) {
        dataLen = UINT_MAX;
-        printf("WARNING: clipping data to %lu bytes\n", dataLen);
+        printf("WARNING: clipping data to %ld bytes\n", dataLen);
    } else if (dataLen == 0) {
        fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
        fclose(f);
@ -118,7 +118,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
    char *inputData = malloc(dataLen);
    if (!inputData) {
-        fprintf(stderr, "ERROR: unable to malloc %lu bytes\n", dataLen);
+        fprintf(stderr, "ERROR: unable to malloc %ld bytes\n", dataLen);
        fclose(f);
        return NULL;
    }
--- a/src/fdr/CMakeLists.txt
+++ b/src/fdr/CMakeLists.txt
@ -27,10 +27,10 @@ fdr_autogen(teddy_runtime teddy_autogen.c)
 fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
 set(fdr_GENERATED_SRC
-${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen.c
+    ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
-${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
+    ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
-${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen.c
+    ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
-${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
+    ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
    PARENT_SCOPE)
 set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
 def build_fdr_matchers():
    all_matchers = [ ]
-    domains = [8, 10, 11, 12, 13]
+    strides = [ 1, 2, 4 ]
    big_domains = [ 14, 15 ]
    common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
-    for d in domains:
+    for s in strides:
-        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+        all_matchers += [ M3(stride = s, **common) ]
        all_matchers += [ M3(stride = 2, domain = d, **common) ]
        all_matchers += [ M3(stride = 4, domain = d, **common) ]
    for d in big_domains:
        all_matchers += [ M3(stride = 1, domain = d, **common) ]
    return all_matchers
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@ -40,27 +40,6 @@
 #include "fdr_confirm_runtime.h"
 #include "fdr_streaming_runtime.h"
 #include "fdr_loadval.h"
 static really_inline UNUSED
 u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
    u32 r = 0;
    if (a->start_offset == 0) {
        if (numBits <= 8) {
            r = a->buf_history[a->len_history - 1];
        } else {
            r = a->buf_history[a->len_history - 1];
            r |= (a->buf[0] << 8);
        }
    } else {
        if (numBits <= 8) {
            r = a->buf[a->start_offset - 1];
        } else {
            r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
        }
    }
    return r & ((1 << numBits) - 1);
 }
 #include "fdr_autogen.c"
 #define FAKE_HISTORY_SIZE 16
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@ -74,7 +74,7 @@ class ValueExtractStep(Step):
        dsb = m.datasize_bytes
        modval = offset % dsb
-        if m.domain > 8 and modval == dsb - 1:
+        if modval == dsb - 1:
            # Case 1: reading more than one byte over the end of the bulk load
            self.latency = 4
@ -101,7 +101,7 @@ class ValueExtractStep(Step):
                    temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
-        init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+        init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
        v_var = self.nv(m.value_extract_type, "v%d" % offset)
        self.val = v_var.gen_initializer_stmt(init_string)
@ -173,14 +173,10 @@ class ConfirmStep(Step):
                                          enable_confirmless = m.stride == 1, do_bailout = False)
 class M3(MatcherBase):
    def get_hash_safety_parameters(self):
        h_size = self.single_load_type.size_in_bytes()
        return (0, h_size - 1)
    def produce_compile_call(self):
-        print "    { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+        print "    { %d, %d, %d, %d, %s, %d, %d }," % (
              self.id, self.state_width, self.num_buckets,
-              self.stride, self.domain,
+              self.stride,
              self.arch.target, self.conf_pull_back, self.conf_top_level_split)
    def produce_main_loop(self, switch_variant = False):
@ -351,7 +347,14 @@ class M3(MatcherBase):
        s = Template("""
    $TYPENAME s;
    if (a->len_history) {
-                u32 tmp = getPreStartVal(a, $DOMAIN);
+        u32 tmp = 0;
        if (a->start_offset == 0) {
            tmp = a->buf_history[a->len_history - 1];
            tmp |= (a->buf[0] << 8);
        } else {
            tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
        }
        tmp &= fdr->domainMask;
        s = *((const $TYPENAME *)ft + tmp);
        $SHIFT_EXPR;
    } else {
@ -359,15 +362,13 @@ class M3(MatcherBase):
    }
 """).substitute(TYPENAME = s_type.get_name(),
                ZERO_EXPR = s_type.zero_expression(),
                DOMAIN = self.domain,
                SHIFT_EXPR = shift_expr)
        return s
    def produce_code(self):
-        (behind, ahead) = self.get_hash_safety_parameters()
+        loop_read_behind = 0
-        loop_read_behind = behind
+        loop_read_ahead = self.loop_bytes + 1
        loop_read_ahead = self.loop_bytes + ahead
        # we set up mask and shift stuff for extracting our masks from registers
        #
@ -380,7 +381,7 @@ class M3(MatcherBase):
        ssb = self.state_type.size / 8 # state size in bytes
        # Intel path
-        if ssb == 16 and self.domain == 16:
+        if ssb == 16:
            # obscure corner - we don't have the room in the register to
            # do this for all values so we don't. domain==16 is pretty
            # bad anyhow, of course
@ -390,7 +391,6 @@ class M3(MatcherBase):
        shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
        self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
        self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
        print self.produce_header(visible = False)
@ -398,21 +398,19 @@ class M3(MatcherBase):
        print " Arch: " + self.arch.name,
        print " State type: " + self.state_type.get_name(),
        print " Num buckets: %d" % self.num_buckets,
        print " Domain: %d" % self.domain,
        print " Stride: %d" % self.stride
        print self.produce_common_declarations()
        print
-        print "\tconst size_t tabSize = %d;" % self.table_size
+        print "    assert(fdr->domain > 8 && fdr->domain < 16);"
-        print """
+        print
-    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
+        print "    u64a domain_mask = fdr->domainMask;"
-    const u32 * confBase = (const u32 *)(ft + tabSize);
+        print "    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
-"""
+        print "    const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
        print self.produce_init_state()
-        print "\tconst size_t iterBytes = %d;" % self.loop_bytes
+        print "    const size_t iterBytes = %d;" % self.loop_bytes
-        print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
+        print "    const size_t START_MOD = %d;" % self.datasize_bytes
-        print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+        print "    const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
        print """
    while (ptr < buf + len) {
@ -451,9 +449,9 @@ class M3(MatcherBase):
        print self.produce_footer()
    def get_name(self):
-        return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+        return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
-    def __init__(self, state_width, domain, stride,
+    def __init__(self, state_width, stride,
                 arch,
                 table_state_width = None,
                 num_buckets = 8,
@ -474,17 +472,9 @@ class M3(MatcherBase):
        self.table_state_width = state_width
        self.table_state_type = getRequiredType(self.table_state_width)
-        # domain is the number of bits that we draw from our input to
+        # this is the load type required for domain [9:15] if we want to
        # index our 'reach' table
        if not 8 <= domain <= 16:
            fail_out("Unsupported domain: %d" % domain)
        self.domain = domain
        # this is the load type required for this domain if we want to
        # load it one at a time
-        self.single_load_type = getRequiredType(self.domain)
+        self.single_load_type = IntegerType(16)
        # table size
        self.table_size = 2**domain * table_state_width // 8
        # stride is the frequency with which we make data-driven
        # accesses to our reach table
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
    ptr += floodControlTmp.second;
    aligned_free(floodControlTmp.first);
    /*  we are allowing domains 9 to 15 only */
    assert(eng.bits > 8 && eng.bits < 16);
    fdr->domain = eng.bits;
    fdr->schemeWidthByte = eng.schemeWidth / 8;
    fdr->domainMask = (1 << eng.bits) - 1;
    fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
    if (link.first) {
        fdr->link = verify_u32(ptr - fdr_base);
        memcpy(ptr, link.first, link.second);
@ -245,6 +252,8 @@ void FDRCompiler::assignStringsToBuckets() {
    typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
    u32 ls = verify_u32(lits.size());
    assert(ls); // Shouldn't be called with no literals.
    // make a vector that contains our literals as pointers or u32 LiteralIndex values
    vector<LiteralIndex> vli;
    vli.resize(ls);
@ -292,6 +301,8 @@ void FDRCompiler::assignStringsToBuckets() {
            currentChunk++;
        }
    }
    assert(currentChunk > 0);
    count[currentChunk - 1] = ls - chunkStartID;
    // close off chunks with an empty row
    firstIds[currentChunk] = ls;
@ -383,12 +394,14 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
                               const vector<hwlmLiteral> &lits,
                               SuffixPositionInString pos,
                               std::map<u32, ue2::unordered_set<u32> > &m2) {
    assert(eng.bits < 32);
    u32 distance = 0;
    if (eng.bits <= 8) {
        distance = 1;
    } else if (eng.bits <= 16) {
        distance = 2;
-    } else if (eng.bits <= 32) {
+    } else {
        distance = 4;
    }
@ -528,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
        return nullptr;
    }
    // temporary hack for unit testing
    if (hint != HINT_INVALID) {
        des->bits = 9;
    }
    FDRCompiler fc(lits, *des, make_small);
    return fc.build(link);
 }
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
        unique_ptr<FDREngineDescription> des =
            getFdrDescription(fdr->engineID);
        if (des) {
            fprintf(f, "    domain     %u\n", des->bits);
            fprintf(f, "    stride     %u\n", des->stride);
            fprintf(f, "    buckets    %u\n", des->getNumBuckets());
            fprintf(f, "    width      %u\n", des->schemeWidth);
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
    : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
                        def.numBuckets, def.confirmPullBackDistance,
                        def.confirmTopLevelSplit),
-      schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
+      schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
 u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
    // rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@ -105,11 +105,16 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
    DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
                 desiredStride);
-    const FDREngineDescription *best = nullptr;
+    FDREngineDescription *best = nullptr;
    u32 best_score = 0;
    for (u32 domain = 9; domain <= 15; domain++) {
        for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
-        const FDREngineDescription &eng = allDescs[engineID];
+            // Domains >= 14 are only tried with the first engine (engineID 0),
+            // so that they keep stride 1 as in the original implementation.
            if (domain > 13 && engineID > 0) {
                continue;
            }
            FDREngineDescription &eng = allDescs[engineID];
            if (!eng.isValidOnTarget(target)) {
                continue;
            }
@ -165,7 +170,7 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
                ideal -= 2;
            }
-        score -= absdiff(ideal, eng.bits);
+            score -= absdiff(ideal, domain);
            DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
                         "-> score=%u\n",
@ -173,10 +178,12 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
                         eng.getNumBuckets(), eng.stride, score);
            if (!best || score > best_score) {
                eng.bits = domain;
                best = &eng;
                best_score = score;
            }
        }
    }
    if (!best) {
        DEBUG_PRINTF("failed to find engine\n");
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@ -43,7 +43,6 @@ struct FDREngineDef {
    u32 schemeWidth;
    u32 numBuckets;
    u32 stride;
    u32 bits;
    u64a cpu_features;
    u32 confirmPullBackDistance;
    u32 confirmTopLevelSplit;
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@ -76,9 +76,11 @@ struct FDR {
     * structures (spillover strings and hash table) if we're a secondary
     * structure. */
    u32 link;
    u8 domain; /* dynamic domain info */
    u8 schemeWidthByte;  /* scheme width in bytes */
    u16 domainMask; /* pre-computed domain mask */
    u32 tabSize; /* pre-computed hashtable size in bytes */
    u32 pad1;
    u32 pad2;
    u32 pad3;
    union {
        u32 s_u32;
--- a/src/nfa/castlecompile.cpp
+++ b/src/nfa/castlecompile.cpp
@ -58,11 +58,13 @@
 #include <boost/range/adaptor/map.hpp>
 using namespace std;
 using boost::adaptors::map_keys;
 using boost::adaptors::map_values;
 namespace ue2 {
 #define CASTLE_MAX_TOPS 32
 #define CLIQUE_GRAPH_MAX_SIZE 1000
 static
 u32 depth_to_u32(const depth &d) {
@ -106,51 +108,35 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
 }
 static
-size_t literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b) {
+bool literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b,
                    const size_t dist) {
    for (size_t i = 0; i < b.size(); i++) {
        if (i > dist) {
            return true;
        }
        size_t overlap_len = b.size() - i;
        if (overlap_len <= a.size()) {
            if (matches(a.end() - overlap_len, a.end(), b.begin(),
                        b.end() - i)) {
-                return i;
+                return false;
            }
        } else {
            assert(overlap_len > a.size());
            if (matches(a.begin(), a.end(), b.end() - i - a.size(),
                        b.end() - i)) {
-                return i;
+                return false;
            }
        }
    }
-    return b.size();
+    return b.size() > dist;
 }
 //  UE-2666 case 1: The problem of find largest exclusive subcastles group
 //  can be reformulated as finding the largest clique (subgraph where every
 //  vertex is connected to every other vertex) in the graph. We use an
 //  approximate algorithm here to find the maximum clique.
 //  References
 //  ----------
 //      [1] Boppana, R., & Halldórsson, M. M. (1992).
 //      Approximating maximum independent sets by excluding subgraphs.
 //      BIT Numerical Mathematics, 32(2), 180–196. Springer.
 //      doi:10.1007/BF01994876
 //  ----------
 struct CliqueVertexProps {
    CliqueVertexProps() {}
    explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
    u32 stateId = ~0U;
    u32 parentId = ~0U;
    bool leftChild = false; /* tells us if it is the left child of its parent */
    bool rightChildVisited = false; /* tells us if its right child is visited */
    vector<u32> clique1; /* clique for the left branch */
    vector<u32> indepSet1; /* independent set for the left branch */
    vector<u32> clique2; /* clique for the right branch */
    vector<u32> indepSet2; /* independent set for the right branch */
 };
 typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
@ -158,181 +144,54 @@ typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
 typedef CliqueGraph::vertex_descriptor CliqueVertex;
 static
-unique_ptr<CliqueGraph> makeCG(const vector<vector<u32>> &exclusiveSet) {
+void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
-    u32 size = exclusiveSet.size();
+                     const CliqueVertex &cv, const set<u32> &group) {
    vector<CliqueVertex> vertices;
    unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
    for (u32 i = 0; i < size; ++i) {
        CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
        vertices.push_back(v);
    }
    // construct the complement graph, then its maximum independent sets
    // are equal to the maximum clique of the original graph
    for (u32 i = 0; i < size; ++i) {
        CliqueVertex s = vertices[i];
        vector<u32> complement(size, 0);
        for (u32 j = 0; j < exclusiveSet[i].size(); ++j) {
            u32 val = exclusiveSet[i][j];
            complement[val] = 1;
        }
        for (u32 k = i + 1; k < size; ++k) {
             if (!complement[k]) {
                CliqueVertex d = vertices[k];
                add_edge(s, d, *cg);
             }
        }
    }
    return cg;
 }
 static
 CliqueGraph createSubgraph(const CliqueGraph &cg,
                           const vector<CliqueVertex> &vertices) {
    CliqueGraph g;
    map<u32, CliqueVertex> vertexMap;
    for (auto u : vertices) {
        u32 id = cg[u].stateId;
        CliqueVertex v = add_vertex(CliqueVertexProps(id), g);
        vertexMap[id] = v;
    }
    set<u32> found;
    for (auto u : vertices) {
        u32 srcId = cg[u].stateId;
        CliqueVertex src = vertexMap[srcId];
        found.insert(srcId);
        for (auto n : adjacent_vertices_range(u, cg)) {
            u32 dstId = cg[n].stateId;
            if (found.find(dstId) == found.end() &&
                vertexMap.find(dstId) != vertexMap.end()) {
                CliqueVertex dst = vertexMap[dstId];
                add_edge(src, dst, g);
            }
        }
    }
    return g;
 }
 static
 void getNeighborInfo(const CliqueGraph &g, vector<CliqueVertex> &neighbor,
                     vector<CliqueVertex> &nonneighbor,
                     const CliqueVertex &cv) {
    u32 id = g[cv].stateId;
    ue2::unordered_set<u32> neighborId;
    // find neighbors for cv
-    for (auto v : adjacent_vertices_range(cv, g)) {
+    for (const auto &v : adjacent_vertices_range(cv, g)) {
-        neighbor.push_back(v);
+        if (g[v].stateId != id && contains(group, g[v].stateId)){
            neighbor.push_back(g[v].stateId);
            neighborId.insert(g[v].stateId);
-    }
+            DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
    // find non-neighbors for cv
    for (auto v : vertices_range(g)) {
        if (g[v].stateId != id &&
            neighborId.find(g[v].stateId) == neighborId.end()) {
            nonneighbor.push_back(v);
        }
    }
 }
 static
-void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n,
+void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique) {
-                      vector<u32> &clique, vector<u32> &indepSet) {
+    stack<vector<u32>> gStack;
    u32 id = cg[n].stateId;
    if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) {
        cg[n].clique1.push_back(id);
        clique.swap(cg[n].clique1);
    } else {
        clique.swap(cg[n].clique2);
    }
-    if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) {
+    // Create mapping between vertex and id
        cg[n].indepSet2.push_back(id);
        indepSet.swap(cg[n].indepSet2);
    } else {
        indepSet.swap(cg[n].indepSet1);
    }
 }
 static
 void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique,
                     vector<u32> &indepSet) {
    stack<CliqueGraph> gStack;
    gStack.push(cg);
    // create mapping between vertex and id
    map<u32, CliqueVertex> vertexMap;
-    for (auto v : vertices_range(cg)) {
+    vector<u32> init;
    for (const auto &v : vertices_range(cg)) {
        vertexMap[cg[v].stateId] = v;
        init.push_back(cg[v].stateId);
    }
    gStack.push(init);
-    // get the vertex to start from
+    // Get the vertex to start from
    ue2::unordered_set<u32> foundVertexId;
    CliqueGraph::vertex_iterator vi, ve;
    tie(vi, ve) = vertices(cg);
    CliqueVertex start = *vi;
    u32 startId = cg[start].stateId;
    bool leftChild = false;
    u32 prevId = startId;
    while (!gStack.empty()) {
-        CliqueGraph g = gStack.top();
+        vector<u32> g = gStack.top();
        gStack.pop();
-        // choose a vertex from the graph
+        // Choose a vertex from the graph
-        tie(vi, ve) = vertices(g);
+        u32 id = g[0];
-        CliqueVertex cv = *vi;
+        const CliqueVertex &n = vertexMap.at(id);
-        u32 id = g[cv].stateId;
+        clique.push_back(id);
-
+        // Corresponding vertex in the original graph
-        // corresponding vertex in the original graph
+        vector<u32> neighbor;
-        CliqueVertex n = vertexMap.at(id);
+        set<u32> subgraphId(g.begin(), g.end());
-
+        getNeighborInfo(cg, neighbor, n, subgraphId);
-        vector<CliqueVertex> neighbor;
+        // Get graph consisting of neighbors for left branch
        vector<CliqueVertex> nonneighbor;
        getNeighborInfo(g, neighbor, nonneighbor, cv);
        if (foundVertexId.find(id) != foundVertexId.end()) {
            prevId = id;
            // get graph consisting of non-neighbors for right branch
            if (!cg[n].rightChildVisited) {
                gStack.push(g);
                if (!nonneighbor.empty()) {
                    const CliqueGraph &nSub = createSubgraph(g, nonneighbor);
                    gStack.push(nSub);
                    leftChild = false;
                }
                cg[n].rightChildVisited = true;
            } else if (id != startId) {
                // both the left and right branches are visited,
                // update its parent's clique and independent sets
                u32 parentId = cg[n].parentId;
                CliqueVertex parent = vertexMap.at(parentId);
                if (cg[n].leftChild) {
                    updateCliqueInfo(cg, n, cg[parent].clique1,
                        cg[parent].indepSet1);
                } else {
                    updateCliqueInfo(cg, n, cg[parent].clique2,
                        cg[parent].indepSet2);
                }
            }
        } else {
            foundVertexId.insert(id);
            g[n].leftChild = leftChild;
            g[n].parentId = prevId;
            gStack.push(g);
            // get graph consisting of neighbors for left branch
        if (!neighbor.empty()) {
-                const CliqueGraph &sub = createSubgraph(g, neighbor);
+            gStack.push(neighbor);
                gStack.push(sub);
                leftChild = true;
            }
            prevId = id;
        }
    }
    updateCliqueInfo(cg, start, clique, indepSet);
 }
 template<typename Graph>
@ -345,18 +204,17 @@ bool graph_empty(const Graph &g) {
 static
 vector<u32> removeClique(CliqueGraph &cg) {
    vector<vector<u32>> cliquesVec(1);
    vector<vector<u32>> indepSetsVec(1);
    DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg));
-    findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]);
+    findCliqueGroup(cg, cliquesVec[0]);
    while (!graph_empty(cg)) {
        const vector<u32> &c = cliquesVec.back();
        vector<CliqueVertex> dead;
-        for (auto v : vertices_range(cg)) {
+        for (const auto &v : vertices_range(cg)) {
            if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
                dead.push_back(v);
            }
        }
-        for (auto v : dead) {
+        for (const auto &v : dead) {
            clear_vertex(v, cg);
            remove_vertex(v, cg);
        }
@ -364,30 +222,22 @@ vector<u32> removeClique(CliqueGraph &cg) {
            break;
        }
        vector<u32> clique;
-        vector<u32> indepSet;
+        findCliqueGroup(cg, clique);
        findCliqueGroup(cg, clique, indepSet);
        cliquesVec.push_back(clique);
        indepSetsVec.push_back(indepSet);
    }
    // get the independent set with max size
    size_t max = 0;
    size_t id = 0;
-    for (size_t j = 0; j < indepSetsVec.size(); ++j) {
+    for (size_t j = 0; j < cliquesVec.size(); ++j) {
-        if (indepSetsVec[j].size() > max) {
+        if (cliquesVec[j].size() > max) {
-            max = indepSetsVec[j].size();
+            max = cliquesVec[j].size();
            id = j;
        }
    }
-    DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size());
+    DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size());
-    return indepSetsVec[id];
+    return cliquesVec[id];
 }
 // Hands the graph produced by makeCG() for \p exclusiveSet to removeClique()
 // and returns whatever vertex-id list that call yields.
 static
 vector<u32> findMaxClique(const vector<vector<u32>> &exclusiveSet) {
    auto graphOwner = makeCG(exclusiveSet);
    auto &graph = *graphOwner;
    return removeClique(graph);
 }
 // if the location of any reset character in one literal are after
@ -401,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2,
    const auto &triggers2 = triggers[id2];
    for (u32 i = 0; i < triggers1.size(); ++i) {
        for (u32 j = 0; j < triggers2.size(); ++j) {
-            size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]);
+            if (!literalOverlap(triggers1[i], triggers2[j],
-            size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]);
+                                min_reset_dist[id2][j]) ||
-            if (max_overlap1 <= min_reset_dist[id2][j] ||
+                !literalOverlap(triggers2[j], triggers1[i],
-                max_overlap2 <= min_reset_dist[id1][i]) {
+                                min_reset_dist[id1][i])) {
                return false;
            }
        }
@ -427,21 +277,26 @@ vector<u32> checkExclusion(const CharReach &cr,
        min_reset_dist.push_back(tmp_dist);
    }
-    vector<vector<u32>> exclusiveSet;
+    vector<CliqueVertex> vertices;
    unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
    for (u32 i = 0; i < triggers.size(); ++i) {
        CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
        vertices.push_back(v);
    }
    // find exclusive pair for each repeat
    for (u32 i = 0; i < triggers.size(); ++i) {
-        vector<u32> repeatIds;
+        CliqueVertex s = vertices[i];
        for (u32 j = i + 1; j < triggers.size(); ++j) {
            if (findExclusivePair(i, j, min_reset_dist, triggers)) {
-                repeatIds.push_back(j);
+                CliqueVertex d = vertices[j];
                add_edge(s, d, *cg);
            }
        }
        exclusiveSet.push_back(repeatIds);
        DEBUG_PRINTF("Exclusive pair size:%lu\n", repeatIds.size());
    }
    // find the largest exclusive group
-    return findMaxClique(exclusiveSet);
+    return removeClique(*cg);
 }
 static
@ -599,7 +454,7 @@ buildCastle(const CastleProto &proto,
        repeatInfoPair.push_back(make_pair(min_period, is_reset));
-        if (is_reset) {
+        if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) {
            candidateTriggers.push_back(triggers.at(top));
            candidateRepeats.push_back(i);
        }
@ -608,7 +463,7 @@ buildCastle(const CastleProto &proto,
    // Case 1: exclusive repeats
    bool exclusive = false;
    bool pureExclusive = false;
-    u8 activeIdxSize = 0;
+    u32 activeIdxSize = 0;
    set<u32> exclusiveGroup;
    if (cc.grey.castleExclusive) {
        vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers);
@ -617,7 +472,7 @@ buildCastle(const CastleProto &proto,
            // Case 1: mutual exclusive repeats group found, initialize state
            // sizes
            exclusive = true;
-            activeIdxSize = calcPackedBytes(exclusiveSize);
+            activeIdxSize = calcPackedBytes(numRepeats + 1);
            if (exclusiveSize == numRepeats) {
                pureExclusive = true;
                streamStateSize = 0;
@ -665,7 +520,7 @@ buildCastle(const CastleProto &proto,
    c->numRepeats = verify_u32(subs.size());
    c->exclusive = exclusive;
    c->pureExclusive = pureExclusive;
-    c->activeIdxSize = activeIdxSize;
+    c->activeIdxSize = verify_u8(activeIdxSize);
    writeCastleScanEngine(cr, c);
@ -710,8 +565,8 @@ buildCastle(const CastleProto &proto,
 set<ReportID> all_reports(const CastleProto &proto) {
    set<ReportID> reports;
-    for (const PureRepeat &pr : proto.repeats | map_values) {
+    for (const ReportID &report : proto.report_map | map_keys) {
-        reports.insert(pr.reports.begin(), pr.reports.end());
+        reports.insert(report);
    }
    return reports;
 }
@ -732,10 +587,30 @@ depth findMaxWidth(const CastleProto &proto) {
    return max_width;
 }
 // Returns bounds.min of the repeat registered under \p top. An unknown top
 // trips an assertion and yields depth::infinity().
 depth findMinWidth(const CastleProto &proto, u32 top) {
    const auto it = proto.repeats.find(top);
    if (it != proto.repeats.end()) {
        return it->second.bounds.min;
    }
    assert(0); // should not happen
    return depth::infinity();
 }
 // Returns bounds.max of the repeat registered under \p top. An unknown top
 // trips an assertion and yields depth(0).
 depth findMaxWidth(const CastleProto &proto, u32 top) {
    const auto it = proto.repeats.find(top);
    if (it != proto.repeats.end()) {
        return it->second.bounds.max;
    }
    assert(0); // should not happen
    return depth(0);
 }
 CastleProto::CastleProto(const PureRepeat &pr) {
    assert(pr.reach.any());
    assert(pr.reports.size() == 1);
-    repeats.insert(make_pair(0, pr));
+    u32 top = 0;
    repeats.emplace(top, pr);
    for (const auto &report : pr.reports) {
        report_map[report].insert(top);
    }
 }
 const CharReach &CastleProto::reach() const {
@ -743,25 +618,29 @@ const CharReach &CastleProto::reach() const {
    return repeats.begin()->second.reach;
 }
 // Counts upwards from zero and returns the first value that is not already
 // a key of \p repeats.
 static
 u32 find_next_top(const map<u32, PureRepeat> &repeats) {
    u32 candidate = 0;
    while (contains(repeats, candidate)) {
        ++candidate;
    }
    return candidate;
 }
 u32 CastleProto::add(const PureRepeat &pr) {
    assert(repeats.size() < max_occupancy);
    assert(pr.reach == reach());
    assert(pr.reports.size() == 1);
-    u32 top = find_next_top(repeats);
+    u32 top = next_top++;
    DEBUG_PRINTF("selected unused top %u\n", top);
-    repeats.insert(make_pair(top, pr));
+    assert(!contains(repeats, top));
    repeats.emplace(top, pr);
    for (const auto &report : pr.reports) {
        report_map[report].insert(top);
    }
    return top;
 }
 // Drops the repeat stored under \p top (which must be present) and removes
 // that top from the top-set of every report_map entry. Entries whose set
 // becomes empty are left in the map.
 void CastleProto::erase(u32 top) {
    DEBUG_PRINTF("erase top %u\n", top);
    assert(contains(repeats, top));
    repeats.erase(top);
    for (auto it = report_map.begin(); it != report_map.end(); ++it) {
        it->second.erase(top);
    }
 }
 u32 CastleProto::merge(const PureRepeat &pr) {
    assert(repeats.size() <= max_occupancy);
    assert(pr.reach == reach());
@ -806,8 +685,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2,
        const u32 top = m.first;
        const PureRepeat &pr = m.second;
        DEBUG_PRINTF("top %u\n", top);
-        u32 new_top = find_next_top(c1.repeats);
+        u32 new_top = c1.add(pr);
        c1.repeats.insert(make_pair(new_top, pr));
        top_map[top] = new_top;
        DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top);
    }
@ -823,12 +701,23 @@ void remapCastleTops(CastleProto &proto, map<u32, u32> &top_map) {
    for (const auto &m : proto.repeats) {
        const u32 top = m.first;
        const PureRepeat &pr = m.second;
-        u32 new_top = find_next_top(out);
+        u32 new_top = out.size();
-        out.insert(make_pair(new_top, pr));
+        out.emplace(new_top, pr);
        top_map[top] = new_top;
    }
    proto.repeats.swap(out);
    // Remap report map.
    proto.report_map.clear();
    for (const auto &m : proto.repeats) {
        const u32 top = m.first;
        const PureRepeat &pr = m.second;
        for (const auto &report : pr.reports) {
            proto.report_map[report].insert(top);
        }
    }
    assert(proto.repeats.size() <= proto.max_occupancy);
 }
@ -904,19 +793,18 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) {
    return c1.repeats == c2.repeats;
 }
-bool requiresDedupe(const CastleProto &proto, const set<ReportID> &reports) {
+bool requiresDedupe(const CastleProto &proto,
-    ue2::unordered_set<ReportID> seen;
+                    const ue2::flat_set<ReportID> &reports) {
-    for (const PureRepeat &pr : proto.repeats | map_values) {
+    for (const auto &report : reports) {
-        for (const ReportID &report : pr.reports) {
+        auto it = proto.report_map.find(report);
-            if (contains(reports, report)) {
+        if (it == end(proto.report_map)) {
-                if (contains(seen, report)) {
+            continue;
        }
        if (it->second.size() > 1) {
            DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
                         report);
            return true;
        }
                seen.insert(report);
            }
        }
    }
    return false;
 }
--- a/src/nfa/castlecompile.h
+++ b/src/nfa/castlecompile.h
@ -38,6 +38,7 @@
 #include "nfagraph/ng_repeat.h"
 #include "util/alloc.h"
 #include "util/depth.h"
 #include "util/ue2_containers.h"
 #include <map>
 #include <memory>
@ -67,8 +68,12 @@ struct CastleProto {
    explicit CastleProto(const PureRepeat &pr);
    const CharReach &reach() const;
    /** \brief Add a new repeat. */
    u32 add(const PureRepeat &pr);
    /** \brief Remove a repeat. */
    void erase(u32 top);
    /**
     * \brief Merge in the given repeat, returning the top used.
     *
@ -80,11 +85,22 @@ struct CastleProto {
    /** \brief Mapping from unique top id to repeat. */
    std::map<u32, PureRepeat> repeats;
    /** \brief Mapping from report to associated tops. */
    ue2::unordered_map<ReportID, flat_set<u32>> report_map;
    /**
     * \brief Next top id to use. Repeats may be removed without top remapping,
     * so we track this explicitly instead of using repeats.size().
     */
    u32 next_top = 1;
 };
 std::set<ReportID> all_reports(const CastleProto &proto);
 depth findMinWidth(const CastleProto &proto);
 depth findMaxWidth(const CastleProto &proto);
 depth findMinWidth(const CastleProto &proto, u32 top);
 depth findMaxWidth(const CastleProto &proto, u32 top);
 /**
 * \brief Remap tops to be contiguous.
@ -133,7 +149,8 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2);
 * \brief True if the given castle contains more than a single instance of any
 * of the reports in the given set.
 */
-bool requiresDedupe(const CastleProto &proto, const std::set<ReportID> &reports);
+bool requiresDedupe(const CastleProto &proto,
                    const ue2::flat_set<ReportID> &reports);
 /**
 * \brief Build an NGHolder from a CastleProto.
--- a/src/nfa/goughcompile.cpp
+++ b/src/nfa/goughcompile.cpp
@ -1136,16 +1136,11 @@ aligned_unique_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
    gough_dfa->length = gough_size;
    /* copy in blocks */
-    memcpy((u8 *)gough_dfa.get() + edge_prog_offset, &edge_blocks[0],
+    copy_bytes((u8 *)gough_dfa.get() + edge_prog_offset, edge_blocks);
           byte_length(edge_blocks));
    if (top_prog_offset) {
-        memcpy((u8 *)gough_dfa.get() + top_prog_offset, &top_blocks[0],
+        copy_bytes((u8 *)gough_dfa.get() + top_prog_offset, top_blocks);
               byte_length(top_blocks));
    }
    if (!temp_blocks.empty()) {
        memcpy((u8 *)gough_dfa.get() + prog_base_offset, &temp_blocks[0],
               byte_length(temp_blocks));
    }
    copy_bytes((u8 *)gough_dfa.get() + prog_base_offset, temp_blocks);
    return gough_dfa;
 }
--- a/src/nfa/goughcompile.h
+++ b/src/nfa/goughcompile.h
@ -70,8 +70,11 @@ struct dstate_som {
 };
 struct raw_som_dfa : public raw_dfa {
-    raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in)
+    raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in, u32 trigger,
-        : raw_dfa(k), unordered_som_triggers(unordered_som_triggers_in) {
+                u32 stream_som_loc_width_in)
        : raw_dfa(k), stream_som_loc_width(stream_som_loc_width_in),
        unordered_som_triggers(unordered_som_triggers_in),
        trigger_nfa_state(trigger) {
        assert(!unordered_som_triggers || is_triggered(kind));
    }
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@ -1397,8 +1397,7 @@ struct Factory {
            repeat->horizon = rsi.horizon;
            repeat->packedCtrlSize = rsi.packedCtrlSize;
            repeat->stateSize = rsi.stateSize;
-            memcpy(repeat->packedFieldSizes, rsi.packedFieldSizes.data(),
+            copy_bytes(repeat->packedFieldSizes, rsi.packedFieldSizes);
                   byte_length(rsi.packedFieldSizes));
            repeat->patchCount = rsi.patchCount;
            repeat->patchSize = rsi.patchSize;
            repeat->encodingSize = rsi.encodingSize;
@ -1413,8 +1412,7 @@ struct Factory {
            // Copy in the sparse lookup table.
            if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
                assert(!rsi.table.empty());
-                memcpy(info_ptr + tableOffset, rsi.table.data(),
+                copy_bytes(info_ptr + tableOffset, rsi.table);
                       byte_length(rsi.table));
            }
            // Fill the tug mask.
@ -1702,6 +1700,7 @@ struct Factory {
        for (u32 i = 0; i < num_repeats; i++) {
            repeatOffsets[i] = offset;
            assert(repeats[i].first);
            memcpy((char *)limex + offset, repeats[i].first.get(),
                   repeats[i].second);
            offset += repeats[i].second;
@ -1709,8 +1708,7 @@ struct Factory {
        // Write repeat offset lookup table.
        assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
-        memcpy((char *)limex + repeatOffsetsOffset, repeatOffsets.data(),
+        copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);
               byte_length(repeatOffsets));
        limex->repeatOffset = repeatOffsetsOffset;
        limex->repeatCount = num_repeats;
@ -1725,8 +1723,7 @@ struct Factory {
        limex->exReportOffset = exceptionReportsOffset;
        assert(ISALIGNED_N((char *)limex + exceptionReportsOffset,
                           alignof(ReportID)));
-        memcpy((char *)limex + exceptionReportsOffset, reports.data(),
+        copy_bytes((char *)limex + exceptionReportsOffset, reports);
               byte_length(reports));
    }
    static
--- a/src/nfa/limex_dump.cpp
+++ b/src/nfa/limex_dump.cpp
@ -317,7 +317,7 @@ template<typename limex_type>
 struct limex_labeller : public nfa_labeller {
    explicit limex_labeller(const limex_type *limex_in) : limex(limex_in) {}
-    void label_state(FILE *f, u32 state) const {
+    void label_state(FILE *f, u32 state) const override {
        const typename limex_traits<limex_type>::exception_type *exceptions
            = getExceptionTable(limex);
        if (!testbit((const u8 *)&limex->exceptionMask,
--- a/src/nfa/limex_exceptional.h
+++ b/src/nfa/limex_exceptional.h
@ -218,7 +218,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
    if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) {
        DEBUG_PRINTF("using cached succ from previous state\n");
        STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), LOAD_STATE(&ctx->cached_esucc)));
-        if (ctx->cached_reports) {
+        if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) {
            DEBUG_PRINTF("firing cached reports from previous state\n");
            if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback,
                                         ctx->context, offset)
                        == MO_HALT_MATCHING)) {
--- a/src/nfa/limex_native.c
+++ b/src/nfa/limex_native.c
@ -83,7 +83,8 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ,
    if (estate == ctx->cached_estate) {
        DEBUG_PRINTF("using cached succ from previous state\n");
        *succ |= ctx->cached_esucc;
-        if (ctx->cached_reports) {
+        if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) {
            DEBUG_PRINTF("firing cached reports from previous state\n");
            if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback,
                                         ctx->context, offset)
                        == MO_HALT_MATCHING)) {
@ -119,8 +120,10 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ,
        ctx->cached_reports = new_cache.reports;
        ctx->cached_br = new_cache.br;
    } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) {
        if (ctx->cached_br) {
            ctx->cached_estate = 0U;
        }
    }
    return 0;
 }
--- a/src/nfa/limex_runtime_impl.h
+++ b/src/nfa/limex_runtime_impl.h
@ -179,7 +179,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
    assert(ISALIGNED_CL(ctx));
    assert(ISALIGNED_CL(&ctx->s));
    STATE_T s = LOAD_STATE(&ctx->s);
    STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */
    /* assert(ISALIGNED_16(exceptions)); */
    /* assert(ISALIGNED_16(reach)); */
@ -305,7 +304,6 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length,
    const ReportID *exReports = getExReports(limex);
    const u32 *exceptionMap = limex->exceptionMap;
    STATE_T s = LOAD_STATE(&ctx->s);
    STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */
    /* assert(ISALIGNED_16(exceptions)); */
    /* assert(ISALIGNED_16(reach)); */
@ -542,7 +540,6 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) {
    ctx->callback = q->cb;
    ctx->context = q->context;
    STORE_STATE(&ctx->cached_estate, ZERO_STATE);
    STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
    assert(q->items[q->cur].location >= 0);
    DEBUG_PRINTF("LOAD STATE\n");
@ -638,7 +635,6 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) {
    ctx->callback = q->cb;
    ctx->context = q->context;
    STORE_STATE(&ctx->cached_estate, ZERO_STATE);
    STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
    DEBUG_PRINTF("LOAD STATE\n");
    STORE_STATE(&ctx->s, LOAD_STATE(q->state));
@ -730,7 +726,6 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q,
    ctx->callback = NULL;
    ctx->context = NULL;
    STORE_STATE(&ctx->cached_estate, ZERO_STATE);
    STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
    DEBUG_PRINTF("LOAD STATE\n");
    STORE_STATE(&ctx->s, LOAD_STATE(q->state));
@ -833,7 +828,6 @@ char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset,
    ctx->callback = cb;
    ctx->context = context;
    STORE_STATE(&ctx->cached_estate, ZERO_STATE);
    STORE_STATE(&ctx->cached_esucc, ZERO_STATE);
    const IMPL_NFA_T *limex = getImplNfa(n);
    STORE_STATE(&ctx->s, INITIAL_FN(limex, 0)); // always anchored
--- a/src/nfa/mcclellancompile.cpp
+++ b/src/nfa/mcclellancompile.cpp
@ -700,7 +700,10 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
    ReportID arb;
    u8 single;
    u32 accelCount;
    u8 alphaShift = info.getAlphaShift();
    assert(alphaShift <= 8);
    u16 count_real_states;
    if (allocateFSN16(info, &count_real_states)) {
        DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
@ -843,6 +846,7 @@ void fillInBasicState8(const dfa_info &info, mstate_aux *aux, u8 *succ_table,
                       const vector<u32> &reports_eod, u32 i) {
    dstate_id_t j = info.implId(i);
    u8 alphaShift = info.getAlphaShift();
    assert(alphaShift <= 8);
    for (size_t s = 0; s < info.impl_alpha_size; s++) {
        dstate_id_t raw_succ = info.states[i].next[s];
--- a/src/nfa/mpv_dump.cpp
+++ b/src/nfa/mpv_dump.cpp
@ -70,9 +70,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
        break;
    case MPV_VERM:
        if (!ourisprint(k->u.verm.c)) {
-            fprintf(f, "verm 0x%hhu\n", k->u.verm.c);
+            fprintf(f, "verm 0x%02x\n", k->u.verm.c);
        } else {
-            fprintf(f, "verm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c);
+            fprintf(f, "verm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c);
        }
        break;
    case MPV_SHUFTI:
@ -87,9 +87,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) {
        break;
    case MPV_NVERM:
        if (!ourisprint(k->u.verm.c)) {
-            fprintf(f, "nverm 0x%hhu\n", k->u.verm.c);
+            fprintf(f, "nverm 0x%02x\n", k->u.verm.c);
        } else {
-            fprintf(f, "nverm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c);
+            fprintf(f, "nverm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c);
        }
        break;
    default:
--- a/src/nfa/nfa_api_queue.h
+++ b/src/nfa/nfa_api_queue.h
@ -196,6 +196,14 @@ static really_inline s64a q_cur_loc(const struct mq *q) {
    return q->items[q->cur].location;
 }
 /** \brief Returns the type of the last event in the queue.
  *
  * Reads the \c type field of the item at index <tt>end - 1</tt>. The queue
  * must satisfy <tt>cur < end</tt>; this is only checked by assertions. */
 static really_inline u32 q_last_type(const struct mq *q) {
    assert(q->cur < q->end);
    // end - 1 is used as an index below, so end must be non-zero...
    assert(q->end > 0);
    // ...and no larger than MAX_MQE_LEN (presumably the capacity of
    // q->items -- confirm against the struct mq definition).
    assert(q->end <= MAX_MQE_LEN);
    return q->items[q->end - 1].type;
 }
 /** \brief Returns the location (relative to the beginning of the current data
 * buffer) of the last event in the queue. */
 static really_inline s64a q_last_loc(const struct mq *q) {
@ -269,7 +277,7 @@ void debugQueue(const struct mq *q) {
            type = "MQE_TOP_N";
            break;
        }
-        DEBUG_PRINTF("\tq[%u] %lld %d:%s\n", cur, q->items[cur].location,
+        DEBUG_PRINTF("\tq[%u] %lld %u:%s\n", cur, q->items[cur].location,
                     q->items[cur].type, type);
    }
 }
--- a/src/nfa/repeat.c
+++ b/src/nfa/repeat.c
@ -39,6 +39,8 @@
 #include "util/pack_bits.h"
 #include "util/partial_store.h"
 #include "util/unaligned.h"
 #include <stdint.h>
 #include <string.h>
 /** \brief Returns the total capacity of the ring.
@ -709,12 +711,7 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info,
    dumpRing(info, xs, ring);
 #endif
-    // We work in terms of the distance between the current offset and the base
+    if (offset - xs->offset < info->repeatMin) {
    // offset in our history.
    u64a delta = offset - xs->offset;
    DEBUG_PRINTF("delta=%llu\n", delta);
    if (delta < info->repeatMin) {
        DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n");
        return REPEAT_NOMATCH;
    }
@ -724,17 +721,22 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info,
        return REPEAT_STALE;
    }
    // If we're not stale, delta fits in the range [repeatMin, lastTop +
    // repeatMax], which fits in a u32.
    assert(offset - xs->offset < UINT32_MAX);
    u32 delta = (u32)(offset - xs->offset);
    DEBUG_PRINTF("delta=%u\n", delta);
    // Find the bounds on possible matches in the ring buffer.
-    u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
+    u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
-    u64a upper = delta - info->repeatMin + 1;
+    u32 upper = MIN(delta - info->repeatMin + 1, ringOccupancy(xs, ringSize));
    upper = MIN(upper, ringOccupancy(xs, ringSize));
    if (lower >= upper) {
        DEBUG_PRINTF("no matches to check\n");
        return REPEAT_NOMATCH;
    }
-    DEBUG_PRINTF("possible match indices=[%llu,%llu]\n", lower, upper);
+    DEBUG_PRINTF("possible match indices=[%u,%u]\n", lower, upper);
    if (ringHasMatch(xs, ring, ringSize, lower, upper)) {
        return REPEAT_MATCH;
    }
@ -1163,7 +1165,7 @@ static
 void storeInitialRingTopPatch(const struct RepeatInfo *info,
                              struct RepeatRingControl *xs,
                              u8 *state, u64a offset) {
-    DEBUG_PRINTF("set the first patch\n");
+    DEBUG_PRINTF("set the first patch, offset=%llu\n", offset);
    xs->offset = offset;
    u8 *active = state;
@ -1197,12 +1199,10 @@ u32 getSparseOptimalTargetValue(const struct RepeatInfo *info,
    return loc;
 }
-u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
+static
-                                 const union RepeatControl *ctrl,
+u64a sparseLastTop(const struct RepeatInfo *info,
-                                 const void *state) {
+                   const struct RepeatRingControl *xs, const u8 *state) {
    DEBUG_PRINTF("looking for last top\n");
    const struct RepeatRingControl *xs = &ctrl->ring;
    u32 patch_size = info->patchSize;
    u32 patch_count = info->patchCount;
    u32 encoding_size = info->encodingSize;
@ -1214,7 +1214,7 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
    }
    DEBUG_PRINTF("patch%u encoding_size%u occ%u\n", patch, encoding_size, occ);
-    const u8 *ring = (const u8 *)state + info->patchesOffset;
+    const u8 *ring = state + info->patchesOffset;
    u64a val = partial_load_u64a(ring + encoding_size * patch, encoding_size);
    DEBUG_PRINTF("val:%llu\n", val);
@ -1231,6 +1231,12 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
    return 0;
 }
 u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info,
                                 const union RepeatControl *ctrl,
                                 const void *state) {
    return sparseLastTop(info, &ctrl->ring, state);
 }
 u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info,
                                   const union RepeatControl *ctrl,
                                   const void *state, u64a offset) {
@ -1249,13 +1255,13 @@ u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info,
    if (nextOffset <= xs->offset + info->repeatMin) {
        patch = xs->first;
        tval = 0;
-    } else if (nextOffset >
+    } else if (nextOffset > sparseLastTop(info, xs, state) + info->repeatMax) {
-               repeatLastTopSparseOptimalP(info, ctrl, state) +
+        DEBUG_PRINTF("ring is stale\n");
               info->repeatMax) {
        return 0;
    } else {
-        u64a delta = nextOffset - xs->offset;
+        assert(nextOffset - xs->offset < UINT32_MAX); // ring is not stale
-        u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
+        u32 delta = (u32)(nextOffset - xs->offset);
        u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
        patch = lower / patch_size;
        tval = lower - patch * patch_size;
    }
@ -1336,21 +1342,32 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
                               union RepeatControl *ctrl, void *state,
                               u64a offset, char is_alive) {
    struct RepeatRingControl *xs = &ctrl->ring;
    u64a delta = offset - xs->offset;
    u32 patch_size = info->patchSize;
    u32 patch_count = info->patchCount;
    u32 encoding_size = info->encodingSize;
    u32 patch = delta / patch_size;
    DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, encoding_size);
    u8 *active = (u8 *)state;
-    if (!is_alive) {
+
    DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset,
                 info->encodingSize);
    // If (a) this is the first top, or (b) the ring is stale, initialize the
    // ring and write this offset in as the first top.
    if (!is_alive ||
        offset > sparseLastTop(info, xs, state) + info->repeatMax) {
        storeInitialRingTopPatch(info, xs, active, offset);
        return;
    }
-    assert(offset >= xs->offset);
+    // Tops should arrive in order, with no duplicates.
    assert(offset > sparseLastTop(info, xs, state));
    // As the ring is not stale, our delta should fit within a u32.
    assert(offset - xs->offset <= UINT32_MAX);
    u32 delta = (u32)(offset - xs->offset);
    u32 patch_size = info->patchSize;
    u32 patch_count = info->patchCount;
    u32 encoding_size = info->encodingSize;
    u32 patch = delta / patch_size;
    DEBUG_PRINTF("delta=%u, patch_size=%u, patch=%u\n", delta, patch_size,
                 patch);
    u8 *ring = active + info->patchesOffset;
    u32 occ = ringOccupancy(xs, patch_count);
@ -1361,10 +1378,6 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
                 patch, patch_count, occ);
    if (patch >= patch_count) {
        u32 patch_shift_count = patch - patch_count + 1;
        if (patch_shift_count >= patch_count) {
            storeInitialRingTopPatch(info, xs, active, offset);
            return;
        }
        assert(patch >= patch_shift_count);
        DEBUG_PRINTF("shifting by %u\n", patch_shift_count);
        xs->offset += patch_size * patch_shift_count;
@ -1401,7 +1414,8 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info,
        }
    }
-    u64a diff = delta - patch * patch_size;
+    assert((u64a)patch * patch_size <= delta);
    u32 diff = delta - patch * patch_size;
    const u64a *repeatTable = getImplTable(info);
    val += repeatTable[diff];
@ -1492,21 +1506,25 @@ enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info,
    if (offset < xs->offset + info->repeatMin) {
        DEBUG_PRINTF("too soon\n");
        return REPEAT_NOMATCH;
-    } else if (offset > repeatLastTopSparseOptimalP(info, ctrl, state) +
+    } else if (offset > sparseLastTop(info, xs, state) + info->repeatMax) {
                        info->repeatMax) {
        DEBUG_PRINTF("stale\n");
        return REPEAT_STALE;
    }
-    u64a delta = offset - xs->offset;
+    // Our delta between the base offset of the ring and the current offset
-    u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
+    // must fit within the range [repeatMin, lastPossibleTop + repeatMax]. This
-    u64a upper = delta - info->repeatMin;
+    // range fits comfortably within a u32.
    assert(offset - xs->offset <= UINT32_MAX);
    u32 delta = (u32)(offset - xs->offset);
    u32 patch_size = info->patchSize;
    u32 patch_count = info->patchCount;
    u32 occ = ringOccupancy(xs, patch_count);
    upper = MIN(upper, occ * patch_size - 1);
-    DEBUG_PRINTF("lower=%llu, upper=%llu\n", lower, upper);
+    u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0;
    u32 upper = MIN(delta - info->repeatMin, occ * patch_size - 1);
    DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper);
    u32 patch_lower = lower / patch_size;
    u32 patch_upper = upper / patch_size;
--- a/src/nfa/repeatcompile.cpp
+++ b/src/nfa/repeatcompile.cpp
@ -75,7 +75,7 @@ u32 calcPackedBytes(u64a val) {
 }
 static
-u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
+u32 repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
                     const u32 minPeriod) {
    u32 repeatTmp = info->patchCount > 2 ? 64 : (u32)repeatMax;
    u32 repeat_index = repeatTmp < minPeriod ? repeatTmp : minPeriod;
@ -93,7 +93,7 @@ u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax,
 static
 u32 findOptimalPatchSize(struct RepeatStateInfo *info, const depth &repeatMax,
-                         const u32 minPeriod, u64a rv) {
+                         const u32 minPeriod, u32 rv) {
    u32 cnt = 0;
    u32 patch_bits = 0;
    u32 total_size = 0;
@ -171,7 +171,7 @@ RepeatStateInfo::RepeatStateInfo(enum RepeatType type, const depth &repeatMin,
        assert(minPeriod);
        assert(repeatMax.is_finite());
        {
-            u64a rv = repeatRecurTable(this, repeatMax, minPeriod);
+            u32 rv = repeatRecurTable(this, repeatMax, minPeriod);
            u32 repeatTmp = 0;
            if ((u32)repeatMax < minPeriod) {
                repeatTmp = repeatMax;
--- a/src/nfagraph/ng.h
+++ b/src/nfagraph/ng.h
@ -64,7 +64,7 @@ public:
              bool prefilter, const som_type som, ReportID rid, u64a min_offset,
              u64a max_offset, u64a min_length);
-    ~NGWrapper();
+    ~NGWrapper() override;
    /** index of the expression represented by this graph, used
     * - down the track in error handling
--- a/src/nfagraph/ng_depth.cpp
+++ b/src/nfagraph/ng_depth.cpp
@ -55,14 +55,14 @@ namespace ue2 {
 namespace {
 /** Distance value used to indicate that the vertex can't be reached. */
-static const int DIST_UNREACHABLE = INT_MAX;
+static constexpr int DIST_UNREACHABLE = INT_MAX;
 /**
 * Distance value used to indicate that the distance to a vertex is infinite
 * (for example, it's the max distance and there's a cycle in the path) or so
 * large that we should consider it effectively infinite.
 */
-static const int DIST_INFINITY = INT_MAX - 1;
+static constexpr int DIST_INFINITY = INT_MAX - 1;
 //
 // Filters
@ -71,10 +71,12 @@ static const int DIST_INFINITY = INT_MAX - 1;
 template <class GraphT>
 struct NodeFilter {
    typedef typename GraphT::edge_descriptor EdgeT;
-    NodeFilter() { }
+    NodeFilter() {} // BGL filters must be default-constructible.
    NodeFilter(const vector<bool> *bad_in, const GraphT *g_in)
        : bad(bad_in), g(g_in) { }
    bool operator()(const EdgeT &e) const {
        assert(g && bad);
        u32 src_idx = (*g)[source(e, *g)].index;
        u32 tar_idx = (*g)[target(e, *g)].index;
@ -84,16 +86,20 @@ struct NodeFilter {
        return !(*bad)[src_idx] && !(*bad)[tar_idx];
    }
-    const vector<bool> *bad;
+
-    const GraphT *g;
+private:
    const vector<bool> *bad = nullptr;
    const GraphT *g = nullptr;
 };
 template <class GraphT>
 struct StartFilter {
    typedef typename GraphT::edge_descriptor EdgeT;
-    StartFilter() { }
+    StartFilter() {} // BGL filters must be default-constructible.
    explicit StartFilter(const GraphT *g_in) : g(g_in) { }
    bool operator()(const EdgeT &e) const {
        assert(g);
        u32 src_idx = (*g)[source(e, *g)].index;
        u32 tar_idx = (*g)[target(e, *g)].index;
@ -107,7 +113,9 @@ struct StartFilter {
        }
        return true;
    }
-    const GraphT *g;
+
 private:
    const GraphT *g = nullptr;
 };
 } // namespace
--- a/src/nfagraph/ng_execute.cpp
+++ b/src/nfagraph/ng_execute.cpp
@ -125,61 +125,62 @@ void execute_graph_i(const NGHolder &g, const vector<StateInfo> &info,
 }
 static
-void fillStateBitset(const NGHolder &g, const set<NFAVertex> &in,
+dynamic_bitset<> makeStateBitset(const NGHolder &g,
-                     dynamic_bitset<> &out) {
+                                 const flat_set<NFAVertex> &in) {
-    out.reset();
+    dynamic_bitset<> work_states(num_vertices(g));
-    for (auto v : in) {
+    for (const auto &v : in) {
        u32 idx = g[v].index;
-        out.set(idx);
+        work_states.set(idx);
    }
    return work_states;
 }
 static
-void fillVertexSet(const dynamic_bitset<> &in,
+flat_set<NFAVertex> getVertices(const dynamic_bitset<> &in,
-                   const vector<StateInfo> &info, set<NFAVertex> &out) {
+                                const vector<StateInfo> &info) {
-    out.clear();
+    flat_set<NFAVertex> out;
    for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
        out.insert(info[i].vertex);
    }
    return out;
 }
 static
-void fillInfoTable(const NGHolder &g, vector<StateInfo> &info) {
+vector<StateInfo> makeInfoTable(const NGHolder &g) {
-    info.resize(num_vertices(g));
+    vector<StateInfo> info(num_vertices(g));
    for (auto v : vertices_range(g)) {
        u32 idx = g[v].index;
        const CharReach &cr = g[v].char_reach;
        assert(idx < info.size());
        info[idx] = StateInfo(v, cr);
    }
    return info;
 }
-void execute_graph(const NGHolder &g, const ue2_literal &input,
+flat_set<NFAVertex> execute_graph(const NGHolder &g, const ue2_literal &input,
-                   set<NFAVertex> *states, bool kill_sds) {
+                                  const flat_set<NFAVertex> &initial_states,
                                  bool kill_sds) {
    assert(hasCorrectlyNumberedVertices(g));
-    vector<StateInfo> info;
+    auto info = makeInfoTable(g);
-    fillInfoTable(g, info);
+    auto work_states = makeStateBitset(g, initial_states);
    dynamic_bitset<> work_states(num_vertices(g));
    fillStateBitset(g, *states, work_states);
    execute_graph_i(g, info, input, &work_states, kill_sds);
-    fillVertexSet(work_states, info, *states);
+    return getVertices(work_states, info);
 }
-void execute_graph(const NGHolder &g, const vector<CharReach> &input,
+flat_set<NFAVertex> execute_graph(const NGHolder &g,
-                   set<NFAVertex> *states) {
+                                  const vector<CharReach> &input,
                                  const flat_set<NFAVertex> &initial_states) {
    assert(hasCorrectlyNumberedVertices(g));
-    vector<StateInfo> info;
+    auto info = makeInfoTable(g);
-    fillInfoTable(g, info);
+    auto work_states = makeStateBitset(g, initial_states);
    dynamic_bitset<> work_states(num_vertices(g));
    fillStateBitset(g, *states, work_states);
    execute_graph_i(g, info, input, &work_states, false);
-    fillVertexSet(work_states, info, *states);
+    return getVertices(work_states, info);
 }
 typedef boost::reverse_graph<const NFAGraph, const NFAGraph &> RevNFAGraph;
@ -276,9 +277,10 @@ private:
 };
 } // namespace
-void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
+flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
-                   const set<NFAVertex> &input_start_states,
+                                  const NGHolder &input_dag,
-                   set<NFAVertex> *states) {
+                                  const flat_set<NFAVertex> &input_start_states,
                                  const flat_set<NFAVertex> &initial_states) {
    DEBUG_PRINTF("g has %zu vertices, input_dag has %zu vertices\n",
                 num_vertices(running_g), num_vertices(input_dag));
    assert(hasCorrectlyNumberedVertices(running_g));
@ -290,10 +292,8 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
    RevNFAGraph revg(input_dag.g);
    map<NFAVertex, dynamic_bitset<> > dfs_states;
-    vector<StateInfo> info;
+    auto info = makeInfoTable(running_g);
-    fillInfoTable(running_g, info);
+    auto input_fs = makeStateBitset(running_g, initial_states);
    dynamic_bitset<> input_fs(num_vertices(running_g));
    fillStateBitset(running_g, *states, input_fs);
    for (auto v : input_start_states) {
        dfs_states[v] = input_fs;
@ -303,21 +303,25 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
                      eg_visitor(running_g, info, input_dag, dfs_states),
                      make_assoc_property_map(colours));
-    fillVertexSet(dfs_states[input_dag.accept], info, *states);
+    auto states = getVertices(dfs_states[input_dag.accept], info);
 #ifdef DEBUG
    DEBUG_PRINTF("  output rstates:");
-        for (auto v : *states) {
+    for (const auto &v : states) {
        printf(" %u", running_g[v].index);
    }
    printf("\n");
 #endif
    return states;
 }
-void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
+flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
-                   set<NFAVertex> *states) {
+                                  const NGHolder &input_dag,
-    set<NFAVertex> input_start_states = {input_dag.start, input_dag.startDs};
+                                  const flat_set<NFAVertex> &initial_states) {
-    execute_graph(running_g, input_dag, input_start_states, states);
+    auto input_start_states = {input_dag.start, input_dag.startDs};
    return execute_graph(running_g, input_dag, input_start_states,
                         initial_states);
 }
 } // namespace ue2
--- a/src/nfagraph/ng_execute.h
+++ b/src/nfagraph/ng_execute.h
@ -35,8 +35,8 @@
 #define NG_EXECUTE_H
 #include "ng_holder.h"
 #include "util/ue2_containers.h"
 #include <set>
 #include <vector>
 namespace ue2 {
@ -44,23 +44,25 @@ namespace ue2 {
 class CharReach;
 struct ue2_literal;
-void execute_graph(const NGHolder &g, const ue2_literal &input,
+flat_set<NFAVertex> execute_graph(const NGHolder &g, const ue2_literal &input,
-                   std::set<NFAVertex> *states, bool kill_sds = false);
+                                  const flat_set<NFAVertex> &initial,
                                  bool kill_sds = false);
-void execute_graph(const NGHolder &g, const std::vector<CharReach> &input,
+flat_set<NFAVertex> execute_graph(const NGHolder &g,
-                   std::set<NFAVertex> *states);
+                                  const std::vector<CharReach> &input,
                                  const flat_set<NFAVertex> &initial);
 /** on exit, states contains any state which may still be enabled after
 * receiving an input which corresponds to some path through the input_dag from
 * start or startDs to accept. input_dag MUST be acyclic aside from self-loops.
 */
-void execute_graph(const NGHolder &g, const NGHolder &input_dag,
+flat_set<NFAVertex> execute_graph(const NGHolder &g, const NGHolder &input_dag,
-                   std::set<NFAVertex> *states);
+                                  const flat_set<NFAVertex> &initial);
 /* as above, but able to specify the source states for the input graph */
-void execute_graph(const NGHolder &g, const NGHolder &input_dag,
+flat_set<NFAVertex> execute_graph(const NGHolder &g, const NGHolder &input_dag,
-                   const std::set<NFAVertex> &input_start_states,
+                                  const flat_set<NFAVertex> &input_start_states,
-                   std::set<NFAVertex> *states);
+                                  const flat_set<NFAVertex> &initial);
 } // namespace ue2
--- a/src/nfagraph/ng_haig.cpp
+++ b/src/nfagraph/ng_haig.cpp
@ -114,7 +114,7 @@ void populateAccepts(const NGHolder &g, StateSet *accept, StateSet *acceptEod) {
 }
 class Automaton_Base {
-public:
+protected:
    Automaton_Base(const NGHolder &graph_in,
                   const ue2::unordered_map<NFAVertex, u32> &state_ids_in)
        : graph(graph_in), state_ids(state_ids_in) {
@ -122,6 +122,7 @@ public:
        assert(alphasize <= ALPHABET_SIZE);
    }
 public:
    static bool canPrune(const flat_set<ReportID> &) { return false; }
    const NGHolder &graph;
@ -608,7 +609,6 @@ bool doHaig(const NGHolder &g,
    }
    haig_note_starts(g, &rdfa->new_som_nfa_states);
    rdfa->trigger_nfa_state = NODE_START;
    return true;
 }
@ -638,7 +638,8 @@ unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
        return nullptr;
    }
-    auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som);
+    auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som, NODE_START,
                                              somPrecision);
    DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
    bool rv;
@ -658,7 +659,6 @@ unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
    DEBUG_PRINTF("determinised, building impl dfa (a,f) = (%hu,%hu)\n",
                 rdfa->start_anchored, rdfa->start_floating);
    rdfa->stream_som_loc_width = somPrecision;
    assert(rdfa->kind == g.kind);
    return rdfa;
@ -782,7 +782,9 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
    typedef Automaton_Haig_Merge::StateSet StateSet;
    vector<StateSet> nfa_state_map;
-    auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som);
+    auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som,
                                              NODE_START,
                                              dfas[0]->stream_som_loc_width);
    int rv = determinise(n, rdfa->states, limit, &nfa_state_map);
    if (rv) {
@ -830,11 +832,9 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
    }
    haig_merge_note_starts(dfas, per_dfa_adj, &rdfa->new_som_nfa_states);
    rdfa->trigger_nfa_state = NODE_START;
    DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
                 rdfa->start_anchored, rdfa->start_floating);
    rdfa->stream_som_loc_width = dfas[0]->stream_som_loc_width;
    return rdfa;
 }
--- a/src/nfagraph/ng_lbr.cpp
+++ b/src/nfagraph/ng_lbr.cpp
@ -98,8 +98,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin,
    info->packedCtrlSize = rsi.packedCtrlSize;
    info->horizon = rsi.horizon;
    info->minPeriod = minPeriod;
-    memcpy(&info->packedFieldSizes, rsi.packedFieldSizes.data(),
+    copy_bytes(&info->packedFieldSizes, rsi.packedFieldSizes);
           byte_length(rsi.packedFieldSizes));
    info->patchCount = rsi.patchCount;
    info->patchSize = rsi.patchSize;
    info->encodingSize = rsi.encodingSize;
@ -122,7 +121,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin,
        nfa->length = verify_u32(len);
        info->length = verify_u32(sizeof(RepeatInfo)
                                  + sizeof(u64a) * (rsi.patchSize + 1));
-        memcpy(table, rsi.table.data(), byte_length(rsi.table));
+        copy_bytes(table, rsi.table);
    }
 }
--- a/src/nfagraph/ng_puff.cpp
+++ b/src/nfagraph/ng_puff.cpp
@ -316,7 +316,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
    bool unbounded = false;
    bool exhaustible = can_exhaust(g, rm);
-    while (a) {
+    while (true) {
        if (is_special(a, g)) {
            DEBUG_PRINTF("stopped puffing due to special vertex\n");
            break;
@ -350,9 +350,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
        a = getSoleSourceVertex(g, a);
-        if (!a) {
+        assert(a); /* already checked that old a had a proper in degree of 1 */
            break;
        }
        // Snark: we can't handle this case, because we can only handle a
        // single report ID on a vertex
--- a/src/nfagraph/ng_som.cpp
+++ b/src/nfagraph/ng_som.cpp
@ -266,7 +266,7 @@ bool validateEXSL(const NGHolder &g,
    const vector<CharReach> escapes_vec(1, escapes);
    const vector<CharReach> notescapes_vec(1, ~escapes);
-    set<NFAVertex> states;
+    ue2::flat_set<NFAVertex> states;
    /* turn on all states past the prefix */
    DEBUG_PRINTF("region %u is cutover\n", region);
    for (auto v : vertices_range(g)) {
@ -276,20 +276,20 @@ bool validateEXSL(const NGHolder &g,
    }
    /* process the escapes */
-    execute_graph(g, escapes_vec, &states);
+    states = execute_graph(g, escapes_vec, states);
    /* flood with any number of not escapes */
-    set<NFAVertex> prev_states;
+    ue2::flat_set<NFAVertex> prev_states;
    while (prev_states != states) {
        prev_states = states;
-        execute_graph(g, notescapes_vec, &states);
+        states = execute_graph(g, notescapes_vec, states);
        insert(&states, prev_states);
    }
    /* find input starts to use for when we are running the prefix through as
     * when the escape character arrives we may be in matching the prefix
     * already */
-    set<NFAVertex> prefix_start_states;
+    ue2::flat_set<NFAVertex> prefix_start_states;
    for (auto v : vertices_range(prefix)) {
        if (v != prefix.accept && v != prefix.acceptEod
            /* and as we have already made it past the prefix once */
@ -298,11 +298,12 @@ bool validateEXSL(const NGHolder &g,
        }
    }
-    execute_graph(prefix, escapes_vec, &prefix_start_states);
+    prefix_start_states =
        execute_graph(prefix, escapes_vec, prefix_start_states);
    assert(contains(prefix_start_states, prefix.startDs));
    /* see what happens after we feed it the prefix */
-    execute_graph(g, prefix, prefix_start_states, &states);
+    states = execute_graph(g, prefix, prefix_start_states, states);
    for (auto v : states) {
        assert(v != g.accept && v != g.acceptEod); /* no cr -> should never be
--- a/src/nfagraph/ng_som_util.cpp
+++ b/src/nfagraph/ng_som_util.cpp
@ -136,7 +136,7 @@ bool firstMatchIsFirst(const NGHolder &p) {
        return false;
    }
-    set<NFAVertex> states;
+    ue2::flat_set<NFAVertex> states;
    /* turn on all states (except starts - avoid suffix matches) */
    /* If we were doing (1) we would also except states leading to accepts -
       avoid prefix matches */
@ -149,7 +149,7 @@ bool firstMatchIsFirst(const NGHolder &p) {
    }
    /* run the prefix the main graph */
-    execute_graph(p, p, &states);
+    states = execute_graph(p, p, states);
    for (auto v : states) {
        /* need to check if this vertex may represent an infix match - ie
@ -313,7 +313,7 @@ bool sentClearsTail(const NGHolder &g,
     */
    u32 first_bad_region = ~0U;
-    set<NFAVertex> states;
+    ue2::flat_set<NFAVertex> states;
    /* turn on all states */
    DEBUG_PRINTF("region %u is cutover\n", last_head_region);
    for (auto v : vertices_range(g)) {
@ -327,7 +327,7 @@ bool sentClearsTail(const NGHolder &g,
    }
    /* run the prefix the main graph */
-    execute_graph(g, sent, &states);
+    states = execute_graph(g, sent, states);
    /* .. and check if we are left with anything in the tail region */
    for (auto v : states) {
--- a/src/nfagraph/ng_width.cpp
+++ b/src/nfagraph/ng_width.cpp
@ -51,10 +51,16 @@ namespace ue2 {
 namespace {
-/** Filter out edges from start-to-start or accept-to-accept. */
+/**
 * Filter out special edges, or in the top-specific variant, start edges that
 * don't have the right top set.
 */
 struct SpecialEdgeFilter {
    SpecialEdgeFilter() {}
-    explicit SpecialEdgeFilter(const NGHolder *h_in) : h(h_in) {}
+    explicit SpecialEdgeFilter(const NGHolder &h_in) : h(&h_in) {}
    explicit SpecialEdgeFilter(const NGHolder &h_in, u32 top_in)
        : h(&h_in), single_top(true), top(top_in) {}
    bool operator()(const NFAEdge &e) const {
        const NFAGraph &g = h->g;
        NFAVertex u = source(e, g), v = target(e, g);
@ -62,23 +68,33 @@ struct SpecialEdgeFilter {
            (is_any_accept(u, g) && is_any_accept(v, g))) {
            return false;
        }
        if (single_top) {
            if (u == h->start && g[e].top != top) {
                return false;
            }
            if (u == h->startDs) {
                return false;
            }
        }
        return true;
    }
 private:
    const NGHolder *h = nullptr;
    bool single_top = false;
    u32 top = 0;
 };
 } // namespace
 static
-depth findMinWidth(const NGHolder &h, NFAVertex src) {
+depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
                   NFAVertex src) {
    if (isLeafNode(src, h)) {
        return depth::unreachable();
    }
-    typedef boost::filtered_graph<NFAGraph, SpecialEdgeFilter> StartGraph;
+    boost::filtered_graph<NFAGraph, SpecialEdgeFilter> g(h.g, filter);
    StartGraph g(h.g, SpecialEdgeFilter(&h));
    assert(hasCorrectlyNumberedVertices(h));
    const size_t num = num_vertices(h);
@ -112,7 +128,8 @@ depth findMinWidth(const NGHolder &h, NFAVertex src) {
 }
 static
-depth findMaxWidth(const NGHolder &h, NFAVertex src) {
+depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
                   NFAVertex src) {
    if (isLeafNode(src, h.g)) {
        return depth::unreachable();
    }
@ -122,8 +139,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
        return depth::infinity();
    }
-    typedef boost::filtered_graph<NFAGraph, SpecialEdgeFilter> NodeFilteredGraph;
+    boost::filtered_graph<NFAGraph, SpecialEdgeFilter> g(h.g, filter);
    NodeFilteredGraph g(h.g, SpecialEdgeFilter(&h));
    assert(hasCorrectlyNumberedVertices(h));
    const size_t num = num_vertices(h);
@ -164,7 +180,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
    if (d.is_unreachable()) {
        // If we're actually reachable, we'll have a min width, so we can
        // return infinity in this case.
-        if (findMinWidth(h, src).is_reachable()) {
+        if (findMinWidth(h, filter, src).is_reachable()) {
            return depth::infinity();
        }
        return d;
@ -175,11 +191,10 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) {
    return d - depth(1);
 }
-/** Returns the minimum width in bytes of an input that will match the given
+static
- * graph. */
+depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter) {
-depth findMinWidth(const NGHolder &h) {
+    depth startDepth = findMinWidth(h, filter, h.start);
-    depth startDepth = findMinWidth(h, h.start);
+    depth dotstarDepth = findMinWidth(h, filter, h.startDs);
    depth dotstarDepth = findMinWidth(h, h.startDs);
    DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(),
                 dotstarDepth.str().c_str());
    if (startDepth.is_unreachable()) {
@ -194,11 +209,18 @@ depth findMinWidth(const NGHolder &h) {
    }
 }
-/** Returns the maximum width in bytes of an input that will match the given
+depth findMinWidth(const NGHolder &h) {
- * graph. If there is no maximum width, returns infinity. */
+    return findMinWidth(h, SpecialEdgeFilter(h));
-depth findMaxWidth(const NGHolder &h) {
+}
-    depth startDepth = findMaxWidth(h, h.start);
+
-    depth dotstarDepth = findMaxWidth(h, h.startDs);
+depth findMinWidth(const NGHolder &h, u32 top) {
    return findMinWidth(h, SpecialEdgeFilter(h, top));
 }
 static
 depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter) {
    depth startDepth = findMaxWidth(h, filter, h.start);
    depth dotstarDepth = findMaxWidth(h, filter, h.startDs);
    DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(),
                 dotstarDepth.str().c_str());
    if (startDepth.is_unreachable()) {
@ -210,4 +232,12 @@ depth findMaxWidth(const NGHolder &h) {
    }
 }
 depth findMaxWidth(const NGHolder &h) {
    return findMaxWidth(h, SpecialEdgeFilter(h));
 }
 depth findMaxWidth(const NGHolder &h, u32 top) {
    return findMaxWidth(h, SpecialEdgeFilter(h, top));
 }
 } // namespace ue2
--- a/src/nfagraph/ng_width.h
+++ b/src/nfagraph/ng_width.h
@ -41,14 +41,34 @@ namespace ue2 {
 class NGHolder;
-/** Returns the minimum width in bytes of an input that will match the given
+/**
- * graph. */
+ * \brief Compute the minimum width in bytes of an input that will match the
 * given graph.
 */
 depth findMinWidth(const NGHolder &h);
-/** Returns the maximum width in bytes of an input that will match the given
+/**
- * graph. If there is no maximum width, returns infinity. */
+ * \brief Compute the minimum width in bytes of an input that will match the
 * given graph, considering only paths activated by the given top.
 */
 depth findMinWidth(const NGHolder &h, u32 top);
 /**
 * \brief Compute the maximum width in bytes of an input that will match the
 * given graph.
 *
 * If there is no bound on the maximum width, returns infinity.
 */
 depth findMaxWidth(const NGHolder &h);
 /**
 * \brief Compute the maximum width in bytes of an input that will match the
 * given graph, considering only paths activated by the given top.
 *
 * If there is no bound on the maximum width, returns infinity.
 */
 depth findMaxWidth(const NGHolder &h, u32 top);
 } // namespace ue2
 #endif // NG_WIDTH_H
--- a/src/parser/AsciiComponentClass.cpp
+++ b/src/parser/AsciiComponentClass.cpp
@ -52,7 +52,8 @@ AsciiComponentClass *AsciiComponentClass::clone() const {
 }
 bool AsciiComponentClass::class_empty(void) const {
-    return cr.none() && cr_ucp.none();
+    assert(finalized);
    return cr.none();
 }
 void AsciiComponentClass::createRange(unichar to) {
@ -60,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) {
    unsigned char from = (u8)range_start;
    if (from > to) {
        throw LocatedParseError("Range out of order in character class");
    } else {
        in_cand_range = false;
        cr.setRange(from, to);
        range_start = INVALID_UNICODE;
    }
    in_cand_range = false;
    CharReach ncr(from, to);
    if (mode.caseless) {
        make_caseless(&ncr);
    }
    cr |= ncr;
    range_start = INVALID_UNICODE;
 }
 void AsciiComponentClass::notePositions(GlushkovBuildState &bs) {
@ -94,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) {
        c = translateForUcpMode(c, mode);
    }
    // Note: caselessness is handled by getPredefinedCharReach.
    CharReach pcr = getPredefinedCharReach(c, mode);
    if (negative) {
        pcr.flip();
    }
    if (isUcp(c)) {
        cr_ucp |= pcr;
    } else {
    cr |= pcr;
    }
    range_start = INVALID_UNICODE;
    in_cand_range = false;
 }
@ -119,7 +121,12 @@ void AsciiComponentClass::add(unichar c) {
        return;
    }
-    cr.set(c);
+    CharReach ncr(c, c);
    if (mode.caseless) {
        make_caseless(&ncr);
    }
    cr |= ncr;
    range_start = c;
 }
@ -135,12 +142,6 @@ void AsciiComponentClass::finalize() {
        in_cand_range = false;
    }
    if (mode.caseless) {
        make_caseless(&cr);
    }
    cr |= cr_ucp; /* characters from ucp props don't participate in caseless */
    if (m_negate) {
        cr.flip();
    }
--- a/src/parser/AsciiComponentClass.h
+++ b/src/parser/AsciiComponentClass.h
@ -78,12 +78,10 @@ protected:
 private:
    Position position;
    CharReach cr;
    CharReach cr_ucp;
    // Private copy ctor. Use clone instead.
    AsciiComponentClass(const AsciiComponentClass &other)
-        : ComponentClass(other), position(other.position), cr(other.cr),
+        : ComponentClass(other), position(other.position), cr(other.cr) {}
          cr_ucp(other.cr_ucp) {}
 };
 } // namespace ue2
--- a/src/parser/ComponentClass.cpp
+++ b/src/parser/ComponentClass.cpp
@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
    case CLASS_DIGIT:
        return number;
    case CLASS_GRAPH:
    case CLASS_XGRAPH:
        return CharReach(0x21, 0x7e);
    case CLASS_XGRAPH:
        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_HORZ:
        return CharReach("\x09\x20\xA0");
    case CLASS_LOWER:
@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
        }
    case CLASS_PRINT:
        return CharReach(0x20, 0x7e);
    case CLASS_XPRINT:
        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_PUNCT:
        return CharReach(0x21, '0' - 1)
            | CharReach('9' + 1, 'A' - 1)
            | CharReach('Z' + 1, 'a' - 1)
            | CharReach('z' + 1, 126);
    case CLASS_XPUNCT:
        return to_cr(getPredefinedCodePointSet(c, mode));
    case CLASS_SPACE:
        return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
    case CLASS_UPPER:
@ -420,7 +425,7 @@ unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
 ComponentClass::ComponentClass(const ParseMode &mode_in)
    : m_negate(false), mode(mode_in), in_cand_range(false),
-      range_start(INVALID_UNICODE), finalized(false), firstChar('\0') {}
+      range_start(INVALID_UNICODE), finalized(false) {}
 ComponentClass::~ComponentClass() { }
@ -441,7 +446,6 @@ void ComponentClass::addDash(void) {
 }
 void ComponentClass::negate() {
    assert(class_empty());
    m_negate = true;
 }
--- a/src/parser/ComponentClass.h
+++ b/src/parser/ComponentClass.h
@ -63,7 +63,9 @@ enum PredefinedClass {
    CLASS_VERT,
    CLASS_WORD,
    CLASS_XDIGIT,
-    CLASS_XGRAPH,
+    CLASS_XGRAPH, /* [:graph:] in UCP mode */
    CLASS_XPRINT, /* [:print:] in UCP mode */
    CLASS_XPUNCT, /* [:punct:] in UCP mode */
    CLASS_UCP_C,
    CLASS_UCP_CC,
    CLASS_UCP_CF,
@ -232,8 +234,12 @@ public:
    Component *accept(ComponentVisitor &v) override = 0;
    void accept(ConstComponentVisitor &v) const override = 0;
-     /** True iff we have already started adding members to the class. This is
+    /** \brief True if the class contains no members (i.e. it will not match
-      * a different concept to Component::empty */
+     * against anything). This function can only be called on a finalized
     * class.
     *
     * Note: This is a different concept to Component::empty.
     */
    virtual bool class_empty(void) const = 0;
    virtual void add(PredefinedClass c, bool negated) = 0;
@ -245,9 +251,6 @@ public:
    bool isNegated() const { return m_negate; }
    void setFirstChar(char c) { firstChar = c; }
    char getFirstChar() const { return firstChar; }
    std::vector<PositionInfo> first() const override = 0;
    std::vector<PositionInfo> last() const override = 0;
    bool empty() const override { return false; } /* always 1 codepoint wide */
@ -263,19 +266,13 @@ protected:
    unichar range_start;
    bool finalized;
    /** Literal character at the start of this character class, e.g. '.' for
     * the class [.abc]. Used to identify (unsupported) POSIX collating
     * elements. */
    char firstChar;
    virtual void createRange(unichar) = 0;
    // Protected copy ctor. Use clone instead.
    ComponentClass(const ComponentClass &other)
        : Component(other), m_negate(other.m_negate), mode(other.mode),
          in_cand_range(other.in_cand_range), range_start(other.range_start),
-          finalized(other.finalized),
+          finalized(other.finalized) {}
          firstChar(other.firstChar) {}
 };
 } // namespace ue2
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
        assert(!inCharClass); // not reentrant
        currentCls = getComponentClass(mode);
        inCharClass = true;
        inCharClassEarly = true;
        currentClsBegin = ts;
        fgoto readClass;
    }
@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
    }
    action is_utf8 { mode.utf8 }
    action is_ignore_space { mode.ignore_space }
    action is_early_charclass { inCharClassEarly }
    action addNumberedBackRef {
        if (accumulator == 0) {
@ -790,10 +792,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
        any => { throw LocatedParseError("Unknown property"); };
                     *|;
    charClassGuts := |*
-              # We don't like POSIX collating elements (neither does PCRE or Perl).
+              # We don't support POSIX collating elements (neither does PCRE
-              '\[\.' [^\]]* '\.\]' | 
+              # or Perl). These look like [.ch.] or [=ch=].
-              '\[=' [^\]]* '=\]' => {
+              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
-                  throw LocatedParseError("Unsupported POSIX collating element");
+              '\[=' ( '\\]' | [^\]] )* '=\]' => {
                  throw LocatedParseError("Unsupported POSIX collating "
                                          "element");
              };
              # Named sets
              # Adding these may cause the charclass to close, hence the
@ -889,11 +893,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                  throw LocatedParseError("Invalid POSIX named class");
              };
              '\\Q' => {
-                  // fcall readQuotedClass;
+                  fcall readQuotedClass;
                  ostringstream str;
                  str << "\\Q..\\E sequences in character classes not supported at index "
                      << ts - ptr << ".";
                  throw ParseError(str.str());
              };
              '\\E' => { /*noop*/};
              # Backspace (this is only valid for \b in char classes)
@ -1090,28 +1090,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                  throwInvalidUtf8();
              };
              # dot or equals at the end of a character class could be the end
              # of a collating element, like [.blah.] or [=blah=].
              [.=] ']' => {
                  if (currentCls->getFirstChar() == *ts) {
                      assert(currentClsBegin);
                      ostringstream oss;
                      oss << "Unsupported POSIX collating element at index "
                          << currentClsBegin - ptr << ".";
                      throw ParseError(oss.str());
                  }
                  currentCls->add(*ts);
                  currentCls->finalize();
                  currentSeq->addComponent(move(currentCls));
                  inCharClass = false;
                  fgoto main;
              };
              # Literal character
              (any - ']') => {
                  if (currentCls->class_empty()) {
                      currentCls->setFirstChar(*ts);
                  }
                  currentCls->add(*ts);
              };
@ -1127,35 +1107,35 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
    # Parser to read stuff from a character class
    #############################################################
    readClass := |*
-        # the negate and right bracket out the front are special
+        # A caret at the beginning of the class means that the rest of the
-        '\^' => {
+        # class is negated.
        '\^' when is_early_charclass => {
            if (currentCls->isNegated()) {
                // Already seen a caret; the second one is not a meta-character.
                inCharClassEarly = false;
                fhold; fgoto charClassGuts;
            } else {
                currentCls->negate();
                // Note: we cannot switch off inCharClassEarly here, as /[^]]/
                // needs to use the right square bracket path below.
            }
        };
-        ']' => {
+        # A right square bracket before anything "real" is interpreted as a
-            // if this is the first thing in the class, add it and move along,
+        # literal right square bracket.
-            // otherwise jump into the char class machine to handle what might
+        ']' when is_early_charclass => {
            // end up as fail
            if (currentCls->class_empty()) {
            currentCls->add(']');
-            } else {
+            inCharClassEarly = false;
                // leave it for the next machine
                fhold;
            }
            fgoto charClassGuts;
        };
        # if we hit a quote before anything "real", handle it
-        #'\\Q' => { fcall readQuotedClass; };
+        '\\Q' => { fcall readQuotedClass; };
        '\\Q' => {
            throw LocatedParseError("\\Q..\\E sequences in character classes not supported");
        };
        '\\E' => { /*noop*/};
        # time for the real work to happen
-        any => { fhold; fgoto charClassGuts; };
+        any => {
            inCharClassEarly = false;
            fhold;
            fgoto charClassGuts;
        };
        *|;
    #############################################################
@ -1183,6 +1163,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
              # Literal character
              any => {
                  currentCls->add(*ts);
                  inCharClassEarly = false;
              };
            *|;
@ -1232,6 +1213,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) {
                  throw LocatedParseError("POSIX named classes are only "
                                          "supported inside a class");
              };
              # We don't support POSIX collating elements (neither does PCRE
              # or Perl). These look like [.ch.] or [=ch=].
              '\[\.' ( '\\]' | [^\]] )* '\.\]' |
              '\[=' ( '\\]' | [^\]] )* '=\]' => {
                  throw LocatedParseError("Unsupported POSIX collating "
                                          "element");
              };
              # Begin eating characters for class
              '\[' => eatClass;
              # Begin quoted literal
@ -1896,6 +1884,11 @@ unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) {
    // brackets [..].
    bool inCharClass = false;
    // True if the machine is inside a character class but it has not processed
    // any "real" elements yet, i.e. it's still processing meta-characters like
    // '^'.
    bool inCharClassEarly = false;
    // Location at which the current character class began.
    const u8 *currentClsBegin = p;
--- a/src/parser/Utf8ComponentClass.cpp
+++ b/src/parser/Utf8ComponentClass.cpp
@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
        } else {
            return CLASS_UCP_LL;
        }
    case CLASS_PRINT:
        return CLASS_XPRINT;
    case CLASS_PUNCT:
        return CLASS_XPUNCT;
    case CLASS_SPACE:
        return CLASS_UCP_XPS;
    case CLASS_UPPER:
@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
    }
 }
 static
 CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode) {
    /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
@ -117,6 +120,22 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c,
        rv |= cf;
        return rv;
    }
    case CLASS_XPRINT: {
        // Same as graph, plus everything with the Zs property.
        CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
        rv |= getUcpZs();
        rv.set(0x180e); // Also included in this class by PCRE 8.38.
        return rv;
    }
    case CLASS_XPUNCT: {
        // Everything with the P (punctuation) property, plus code points in S
        // (symbols) that are < 128.
        CodePointSet rv = getUcpP();
        CodePointSet symbols = getUcpS();
        symbols.unsetRange(128, MAX_UNICODE);
        rv |= symbols;
        return rv;
    }
    case CLASS_HORZ: {
        CodePointSet rv;
        rv.set(0x0009); /* Horizontal tab */
@ -484,7 +503,8 @@ UTF8ComponentClass *UTF8ComponentClass::clone() const {
 }
 bool UTF8ComponentClass::class_empty(void) const {
-    return cps.none() && cps_ucp.none();
+    assert(finalized);
    return cps.none();
 }
 void UTF8ComponentClass::createRange(unichar to) {
@ -492,7 +512,8 @@ void UTF8ComponentClass::createRange(unichar to) {
    unichar from = range_start;
    if (from > to) {
        throw LocatedParseError("Range out of order in character class");
-    } else {
+    }
    in_cand_range = false;
    CodePointSet ncps;
    ncps.setRange(from, to);
@ -502,7 +523,6 @@ void UTF8ComponentClass::createRange(unichar to) {
    cps |= ncps;
    range_start = INVALID_UNICODE;
 }
 }
 void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
    if (in_cand_range) { // can't form a range here
@ -520,11 +540,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
        pcps.flip();
    }
    if (isUcp(c)) {
        cps_ucp |= pcps;
    } else {
    cps |= pcps;
    }
    range_start = INVALID_UNICODE;
    in_cand_range = false;
@ -562,8 +578,6 @@ void UTF8ComponentClass::finalize() {
        in_cand_range = false;
    }
    cps |= cps_ucp; /* characters from ucp props always case sensitive */
    if (m_negate) {
        cps.flip();
    }
@ -571,31 +585,6 @@ void UTF8ComponentClass::finalize() {
    finalized = true;
 }
 bool isUcp(PredefinedClass c) {
    switch (c) {
    case CLASS_ALNUM:
    case CLASS_ALPHA:
    case CLASS_ANY:
    case CLASS_ASCII:
    case CLASS_BLANK:
    case CLASS_CNTRL:
    case CLASS_DIGIT:
    case CLASS_GRAPH:
    case CLASS_HORZ:
    case CLASS_LOWER:
    case CLASS_PRINT:
    case CLASS_PUNCT:
    case CLASS_SPACE:
    case CLASS_UPPER:
    case CLASS_VERT:
    case CLASS_WORD:
    case CLASS_XDIGIT:
        return false;
    default:
        return true;
    }
 }
 Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
    map<u8, Position>::const_iterator it = heads.find(first_byte);
    if (it != heads.end()) {
--- a/src/parser/Utf8ComponentClass.h
+++ b/src/parser/Utf8ComponentClass.h
@ -93,7 +93,6 @@ private:
    void buildFourByte(GlushkovBuildState &bs);
    CodePointSet cps;
    CodePointSet cps_ucp;
    std::map<u8, Position> heads;
    Position single_pos;
@ -108,7 +107,9 @@ private:
 };
 PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode);
-bool isUcp(PredefinedClass c);
+
 CodePointSet getPredefinedCodePointSet(PredefinedClass c,
                                       const ParseMode &mode);
 } // namespace
--- a/src/parser/check_refs.cpp
+++ b/src/parser/check_refs.cpp
@ -57,7 +57,7 @@ public:
    ReferenceVisitor(size_t num_groups, const flat_set<string> &targets)
        : num_ids(num_groups), names(targets) {}
-    ~ReferenceVisitor();
+    ~ReferenceVisitor() override;
    void invalid_index(const char *component, unsigned id) {
        assert(component);
--- a/src/parser/prefilter.cpp
+++ b/src/parser/prefilter.cpp
@ -201,7 +201,7 @@ const ComponentSequence *findCapturingGroup(const Component *root,
 class PrefilterVisitor : public DefaultComponentVisitor {
 public:
    PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {}
-    ~PrefilterVisitor();
+    ~PrefilterVisitor() override;
    /** \brief Calls the visitor (recursively) on a new replacement component
     * we've just created. Takes care of freeing it if the sequence is itself
--- a/src/parser/shortcut_literal.cpp
+++ b/src/parser/shortcut_literal.cpp
@ -64,7 +64,7 @@ namespace ue2 {
 */
 class ConstructLiteralVisitor : public ConstComponentVisitor {
 public:
-    ~ConstructLiteralVisitor();
+    ~ConstructLiteralVisitor() override;
    /** \brief Thrown if this component does not represent a literal. */
    struct NotLiteral {};
--- a/src/parser/unsupported.cpp
+++ b/src/parser/unsupported.cpp
@ -44,7 +44,7 @@ namespace ue2 {
 * an unsupported component. */
 class UnsupportedVisitor : public DefaultConstComponentVisitor {
 public:
-    ~UnsupportedVisitor();
+    ~UnsupportedVisitor() override;
    void pre(const ComponentAssertion &) override {
        throw ParseError("Zero-width assertions are not supported.");
    }
--- a/src/rose/catchup.c
+++ b/src/rose/catchup.c
@ -379,7 +379,7 @@ void ensureEnd(struct mq *q, UNUSED u32 qi, s64a final_loc) {
    DEBUG_PRINTF("ensure MQE_END %lld for queue %u\n", final_loc, qi);
    if (final_loc >= q_last_loc(q)) {
        /* TODO: ensure situation does not arise */
-        assert(q->items[q->end - 1].type != MQE_END);
+        assert(q_last_type(q) != MQE_END);
        pushQueueNoMerge(q, MQE_END, final_loc);
    }
 }
--- a/src/rose/match.c
+++ b/src/rose/match.c
@ -758,7 +758,7 @@ found_miracle:
    q_skip_forward_to(q, miracle_loc);
-    if (q->items[q->end - 1].type == MQE_START) {
+    if (q_last_type(q) == MQE_START) {
        DEBUG_PRINTF("miracle caused infix to die\n");
        return 0;
    }
@ -853,7 +853,7 @@ char roseTestLeftfix(const struct RoseEngine *t, const struct RoseRole *tr,
        }
    }
-    if (q_cur_loc(q) < loc || q->items[q->end - 1].type != MQE_START) {
+    if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) {
        if (left->infix) {
            if (infixTooOld(q, loc)) {
                DEBUG_PRINTF("infix %u died of old age\n", ri);
--- a/src/rose/rose_build.h
+++ b/src/rose/rose_build.h
@ -42,6 +42,7 @@
 #include "rose_in_graph.h"
 #include "util/alloc.h"
 #include "util/charreach.h"
 #include "util/ue2_containers.h"
 #include "util/ue2string.h"
 #include <memory>
@ -72,8 +73,8 @@ public:
    /** \brief True if we can not establish that at most a single callback will
     * be generated at a given offset from this set of reports. */
-    virtual bool requiresDedupeSupport(const std::set<ReportID> &reports) const
+    virtual bool requiresDedupeSupport(const ue2::flat_set<ReportID> &reports)
-        = 0;
+        const = 0;
 };
 /** \brief Abstract interface intended for callers from elsewhere in the tree,
--- a/src/rose/rose_build_anchored.cpp
+++ b/src/rose/rose_build_anchored.cpp
@ -271,16 +271,13 @@ public:
    typedef Holder_StateSet StateSet;
    typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
-    explicit Automaton_Holder(const NGHolder &g_in) : g(g_in), bad(false) {
+    explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) {
        for (auto v : vertices_range(g)) {
            vertexToIndex[v] = indexToVertex.size();
            indexToVertex.push_back(v);
        }
-        if (indexToVertex.size() > ANCHORED_NFA_STATE_LIMIT) {
+        assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT);
            bad = true;
            return;
        }
        DEBUG_PRINTF("%zu states\n", indexToVertex.size());
        init.wdelay = 0;
@ -400,7 +397,6 @@ public:
    array<u16, ALPHABET_SIZE> alpha;
    array<u16, ALPHABET_SIZE> unalpha;
    u16 alphasize;
    bool bad;
 };
 } // namespace
@ -670,13 +666,13 @@ int finalise_out(RoseBuildImpl &tbi, const NGHolder &h,
 static
 int addAutomaton(RoseBuildImpl &tbi, const NGHolder &h, ReportID *remap) {
-    Automaton_Holder autom(h);
+    if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) {
    if (autom.bad) {
        DEBUG_PRINTF("autom bad!\n");
        return ANCHORED_FAIL;
    }
    Automaton_Holder autom(h);
    unique_ptr<raw_dfa> out_dfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX);
    if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) {
        return finalise_out(tbi, h, autom, move(out_dfa), remap);
@ -738,7 +734,6 @@ void buildSimpleDfas(const RoseBuildImpl &tbi,
        NGHolder h;
        populate_holder(simple.first, exit_ids, &h);
        Automaton_Holder autom(h);
        assert(!autom.bad);
        unique_ptr<raw_dfa> rdfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX);
        UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES);
        assert(!rv);
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@ -2687,12 +2687,6 @@ void fillInReportInfo(RoseEngine *engine, u32 reportOffset,
                 sizeof(internal_report));
 }
 static
 void populateInvDkeyTable(char *ptr, const ReportManager &rm) {
    vector<ReportID> table = rm.getDkeyToReportTable();
    memcpy(ptr, table.data(), byte_length(table));
 }
 static
 bool hasSimpleReports(const vector<Report> &reports) {
    auto it = find_if(reports.begin(), reports.end(), isComplexReport);
@ -4154,7 +4148,7 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
    engine->ekeyCount = rm.numEkeys();
    engine->dkeyCount = rm.numDkeys();
    engine->invDkeyOffset = dkeyOffset;
-    populateInvDkeyTable(ptr + dkeyOffset, rm);
+    copy_bytes(ptr + dkeyOffset, rm.getDkeyToReportTable());
    engine->somHorizon = ssm.somPrecision();
    engine->somLocationCount = ssm.numSomSlots();
@ -4314,33 +4308,22 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
    buildLitBenefits(*this, engine.get(), base_lits_benefits_offset);
    // Copy in other tables
-    memcpy(ptr + bc.engine_blob_base, bc.engine_blob.data(),
+    copy_bytes(ptr + bc.engine_blob_base, bc.engine_blob);
-           byte_length(bc.engine_blob));
+    copy_bytes(ptr + engine->literalOffset, literalTable);
-
+    copy_bytes(ptr + engine->roleOffset, bc.roleTable);
-    memcpy(ptr + engine->literalOffset, literalTable.data(),
+    copy_bytes(ptr + engine->leftOffset, leftInfoTable);
           byte_length(literalTable));
    memcpy(ptr + engine->roleOffset, bc.roleTable.data(),
           byte_length(bc.roleTable));
    copy(leftInfoTable.begin(), leftInfoTable.end(),
         (LeftNfaInfo *)(ptr + engine->leftOffset));
    fillLookaroundTables(ptr + lookaroundTableOffset,
                         ptr + lookaroundReachOffset, bc.lookaround);
    fillInSomRevNfas(engine.get(), ssm, rev_nfa_table_offset, rev_nfa_offsets);
-    memcpy(ptr + engine->predOffset, predTable.data(), byte_length(predTable));
+    copy_bytes(ptr + engine->predOffset, predTable);
-    memcpy(ptr + engine->rootRoleOffset, rootRoleTable.data(),
+    copy_bytes(ptr + engine->rootRoleOffset, rootRoleTable);
-           byte_length(rootRoleTable));
+    copy_bytes(ptr + engine->anchoredReportMapOffset, art);
-    memcpy(ptr + engine->anchoredReportMapOffset, art.data(), byte_length(art));
+    copy_bytes(ptr + engine->anchoredReportInverseMapOffset, arit);
-    memcpy(ptr + engine->anchoredReportInverseMapOffset, arit.data(),
+    copy_bytes(ptr + engine->multidirectOffset, mdr_reports);
-           byte_length(arit));
+    copy_bytes(ptr + engine->activeLeftIterOffset, activeLeftIter);
-    memcpy(ptr + engine->multidirectOffset, mdr_reports.data(),
+    copy_bytes(ptr + engine->sideOffset, sideTable);
           byte_length(mdr_reports));
    copy(activeLeftIter.begin(), activeLeftIter.end(),
         (mmbit_sparse_iter *)(ptr + engine->activeLeftIterOffset));
    memcpy(ptr + engine->sideOffset, sideTable.data(), byte_length(sideTable));
    DEBUG_PRINTF("rose done %p\n", engine.get());
    return engine;
--- a/src/rose/rose_build_compile.cpp
+++ b/src/rose/rose_build_compile.cpp
@ -1631,20 +1631,23 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
    assert(left.graph());
    const NGHolder &h = *left.graph();
    ue2::flat_set<NFAVertex> all_states;
    insert(&all_states, vertices(h));
    assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */
    DEBUG_PRINTF("removing sds\n");
    all_states.erase(h.startDs);
    ue2::flat_set<NFAVertex> states;
    /* check each pred literal to see if they all kill previous graph
     * state */
    for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) {
        const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id);
        const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s);
        set<NFAVertex> states;
        insert(&states, vertices(h));
        assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */
        DEBUG_PRINTF("removing sds\n");
        states.erase(h.startDs);
        DEBUG_PRINTF("running graph %zu\n", states.size());
-        execute_graph(h, s, &states, true);
+        states = execute_graph(h, s, all_states, true);
-        DEBUG_PRINTF("ran\n");
+        DEBUG_PRINTF("ran, %zu states on\n", states.size());
        if (!states.empty()) {
            return false;
--- a/src/rose/rose_build_impl.h
+++ b/src/rose/rose_build_impl.h
@ -130,6 +130,8 @@ private:
    friend depth findMinWidth(const suffix_id &s);
    friend depth findMaxWidth(const suffix_id &s);
    friend depth findMinWidth(const suffix_id &s, u32 top);
    friend depth findMaxWidth(const suffix_id &s, u32 top);
 };
 std::set<ReportID> all_reports(const suffix_id &s);
@ -138,6 +140,8 @@ bool has_eod_accepts(const suffix_id &s);
 bool has_non_eod_accepts(const suffix_id &s);
 depth findMinWidth(const suffix_id &s);
 depth findMaxWidth(const suffix_id &s);
 depth findMinWidth(const suffix_id &s, u32 top);
 depth findMaxWidth(const suffix_id &s, u32 top);
 size_t hash_value(const suffix_id &s);
 /** \brief represents an engine to the left of a rose role */
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@ -77,6 +77,8 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in,
      hasSom(false),
      group_weak_end(0),
      group_end(0),
      anchored_base_id(MO_INVALID_IDX),
      nonbenefits_base_id(MO_INVALID_IDX),
      ematcher_region_size(0),
      floating_direct_report(false),
      eod_event_literal_id(MO_INVALID_IDX),
@ -536,7 +538,7 @@ u32 RoseBuildImpl::getNewLiteralId() {
 }
 static
-bool requiresDedupe(const NGHolder &h, const set<ReportID> &reports,
+bool requiresDedupe(const NGHolder &h, const ue2::flat_set<ReportID> &reports,
                    const Grey &grey) {
    /* TODO: tighten */
    NFAVertex seen_vert = NFAGraph::null_vertex();
@ -579,7 +581,8 @@ bool requiresDedupe(const NGHolder &h, const set<ReportID> &reports,
 class RoseDedupeAuxImpl : public RoseDedupeAux {
 public:
    explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in);
-    bool requiresDedupeSupport(const set<ReportID> &reports) const override;
+    bool requiresDedupeSupport(
        const ue2::flat_set<ReportID> &reports) const override;
    const RoseBuildImpl &tbi;
    map<ReportID, set<RoseVertex>> vert_map;
@ -599,6 +602,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
    : tbi(tbi_in) {
    const RoseGraph &g = tbi.g;
    set<suffix_id> suffixes;
    for (auto v : vertices_range(g)) {
        // Literals in the small block table don't count as dupes: although
        // they have copies in the anchored table, the two are never run in the
@ -609,11 +614,17 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
            }
        }
        // Several vertices may share a suffix, so we collect the set of
        // suffixes first to avoid repeating work.
        if (g[v].suffix) {
-            for (const auto &report_id : all_reports(g[v].suffix)) {
+            suffixes.insert(g[v].suffix);
                suffix_map[report_id].insert(g[v].suffix);
        }
    }
    for (const auto &suffix : suffixes) {
        for (const auto &report_id : all_reports(suffix)) {
            suffix_map[report_id].insert(suffix);
        }
    }
    for (const auto &outfix : tbi.outfixes) {
@ -634,8 +645,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
    }
 }
-bool RoseDedupeAuxImpl::requiresDedupeSupport(const set<ReportID> &reports)
+bool RoseDedupeAuxImpl::requiresDedupeSupport(
-    const {
+    const ue2::flat_set<ReportID> &reports) const {
    /* TODO: this could be expanded to check for offset or character
       constraints */
@ -897,6 +908,17 @@ depth findMinWidth(const suffix_id &s) {
    }
 }
 depth findMinWidth(const suffix_id &s, u32 top) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());
    if (s.graph()) {
        return findMinWidth(*s.graph(), top);
    } else if (s.castle()) {
        return findMinWidth(*s.castle(), top);
    } else {
        return s.dfa_min_width;
    }
 }
 depth findMaxWidth(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());
    if (s.graph()) {
@ -908,6 +930,17 @@ depth findMaxWidth(const suffix_id &s) {
    }
 }
 depth findMaxWidth(const suffix_id &s, u32 top) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());
    if (s.graph()) {
        return findMaxWidth(*s.graph(), top);
    } else if (s.castle()) {
        return findMaxWidth(*s.castle(), top);
    } else {
        return s.dfa_max_width;
    }
 }
 bool has_eod_accepts(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());
    if (s.graph()) {
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@ -439,12 +439,16 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) {
    hash_combine(val, hash_range(begin(props.reports), end(props.reports)));
    if (props.suffix) {
-        hash_combine(val, all_reports(props.suffix));
+        const auto &suffix = props.suffix;
-        if (props.suffix.graph) {
+        if (suffix.castle) {
-            hash_combine(val, num_vertices(*props.suffix.graph));
+            hash_combine(val, suffix.castle->reach());
            hash_combine(val, suffix.castle->repeats.size());
        }
-        if (props.suffix.haig) {
+        if (suffix.graph) {
-            hash_combine(val, hash_dfa(*props.suffix.haig));
+            hash_combine(val, num_vertices(*suffix.graph));
        }
        if (suffix.haig) {
            hash_combine(val, hash_dfa(*suffix.haig));
        }
    }
@ -747,14 +751,17 @@ void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr<NGHolder> h,
 * Castle. */
 static
 void pruneCastle(CastleProto &castle, ReportID report) {
-    for (map<u32, PureRepeat>::iterator it = castle.repeats.begin();
+    unordered_set<u32> dead; // tops to remove.
-         it != castle.repeats.end(); /* incr inside */) {
+    for (const auto &m : castle.repeats) {
-        if (contains(it->second.reports, report)) {
+        if (!contains(m.second.reports, report)) {
-            ++it;
+            dead.insert(m.first);
        } else {
            castle.repeats.erase(it++);
        }
    }
    for (const auto &top : dead) {
        castle.erase(top);
    }
    assert(!castle.repeats.empty());
 }
@ -794,7 +801,7 @@ void pruneUnusedTops(CastleProto &castle, const RoseGraph &g,
    for (u32 top : assoc_keys(castle.repeats)) {
        if (!contains(used_tops, top)) {
            DEBUG_PRINTF("removing unused top %u\n", top);
-            castle.repeats.erase(top);
+            castle.erase(top);
        }
    }
 }
--- a/src/rose/rose_build_width.cpp
+++ b/src/rose/rose_build_width.cpp
@ -94,10 +94,11 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) {
        }
        if (g[v].suffix) {
-            depth suffix_width = findMinWidth(g[v].suffix);
+            depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top);
            assert(suffix_width.is_reachable());
-            DEBUG_PRINTF("%zu has suffix (width %s), can fire report at %u\n",
+            DEBUG_PRINTF("%zu has suffix with top %u (width %s), can fire "
-                         g[v].idx, suffix_width.str().c_str(),
+                         "report at %u\n",
                         g[v].idx, g[v].suffix.top, suffix_width.str().c_str(),
                         w + suffix_width);
            minWidth = min(minWidth, w + suffix_width);
        }
@ -146,8 +147,9 @@ u32 findMaxBAWidth(const RoseBuildImpl &tbi) {
            if (has_non_eod_accepts(g[v].suffix)) {
                return ROSE_BOUND_INF;
            }
-            depth suffix_width = findMaxWidth(g[v].suffix);
+            depth suffix_width = findMaxWidth(g[v].suffix, g[v].suffix.top);
-            DEBUG_PRINTF("suffix max width %s\n", suffix_width.str().c_str());
+            DEBUG_PRINTF("suffix max width for top %u is %s\n", g[v].suffix.top,
                         suffix_width.str().c_str());
            assert(suffix_width.is_reachable());
            if (!suffix_width.is_finite()) {
                DEBUG_PRINTF("suffix too wide\n");
--- a/src/rose/stream.c
+++ b/src/rose/stream.c
@ -167,7 +167,7 @@ found_miracle:
        DEBUG_PRINTF("skip q forward, %lld to %lld\n", begin_loc, miracle_loc);
        q_skip_forward_to(q, miracle_loc);
-        if (q->items[q->end - 1].type == MQE_START) {
+        if (q_last_type(q) == MQE_START) {
            DEBUG_PRINTF("miracle caused infix to die\n");
            return MIRACLE_DEAD;
        }
--- a/src/util/compare.h
+++ b/src/util/compare.h
@ -98,18 +98,22 @@ u64a theirtoupper64(const u64a x) {
 static really_inline
 int cmpNocaseNaive(const u8 *p1, const u8 *p2, size_t len) {
    const u8 *pEnd = (const u8 *)p1 + len;
-    for (; p1 < pEnd; p1++, p2++)
+    for (; p1 < pEnd; p1++, p2++) {
-        if (mytolower(*p1) != mytolower(*p2))
+        if (mytolower(*p1) != mytolower(*p2)) {
            return 1;
        }
    }
    return 0;
 }
 static really_inline
 int cmpCaseNaive(const u8 *p1, const u8 *p2, size_t len) {
    const u8 *pEnd = (const u8 *)p1 + len;
-    for (; p1 < pEnd; p1++, p2++)
+    for (; p1 < pEnd; p1++, p2++) {
-        if (*p1 != *p2)
+        if (*p1 != *p2) {
            return 1;
        }
    }
    return 0;
 }
--- a/src/util/container.h
+++ b/src/util/container.h
@ -33,8 +33,13 @@
 #ifndef UTIL_CONTAINER_H
 #define UTIL_CONTAINER_H
 #include "ue2common.h"
 #include <algorithm>
 #include <cassert>
 #include <cstring>
 #include <set>
 #include <type_traits>
 #include <utility>
 namespace ue2 {
@ -92,11 +97,35 @@ std::set<typename C::key_type> assoc_keys(const C &container) {
    return keys;
 }
 /**
 * \brief Return the length in bytes of the given vector of (POD) objects.
 */
 template<typename T>
 typename std::vector<T>::size_type byte_length(const std::vector<T> &vec) {
    static_assert(std::is_pod<T>::value, "should be pod");
    return vec.size() * sizeof(T);
 }
 /**
 * \brief Copy the given vector of POD objects to the given location in memory.
 * It is safe to give this function an empty vector.
 */
 template<typename T>
 void *copy_bytes(void *dest, const std::vector<T> &vec) {
    static_assert(std::is_pod<T>::value, "should be pod");
    assert(dest);
    // Since we're generally using this function to write into the bytecode,
    // dest should be appropriately aligned for T.
    assert(ISALIGNED_N(dest, alignof(T)));
    if (vec.empty()) {
        return dest; // Protect memcpy against null pointers.
    }
    assert(vec.data() != nullptr);
    return std::memcpy(dest, vec.data(), byte_length(vec));
 }
 template<typename OrderedContainer1, typename OrderedContainer2>
 bool is_subset_of(const OrderedContainer1 &small, const OrderedContainer2 &big) {
    static_assert(std::is_same<typename OrderedContainer1::value_type,
--- a/src/util/depth.h
+++ b/src/util/depth.h
@ -183,7 +183,7 @@ public:
        s64a rv = val + d;
        if (rv < 0 || (u64a)rv >= val_infinity) {
-            DEBUG_PRINTF("depth %llu too large to represent!\n", rv);
+            DEBUG_PRINTF("depth %lld too large to represent!\n", rv);
            throw DepthOverflowError();
        }
--- a/src/util/multibit.c
+++ b/src/util/multibit.c
@ -142,23 +142,25 @@ const u32 mmbit_root_offset_from_level[7] = {
 u32 mmbit_size(u32 total_bits) {
    MDEBUG_PRINTF("%u\n", total_bits);
    // UE-2228: multibit has bugs in very, very large cases that we should be
    // protected against at compile time by resource limits.
    assert(total_bits <= 1U << 30);
    // Flat model multibit structures are just stored as a bit vector.
    if (total_bits <= MMB_FLAT_MAX_BITS) {
        return ROUNDUP_N(total_bits, 8) / 8;
    }
-    u32 current_level = 1;
+    u64a current_level = 1; // Number of blocks on current level.
-    u32 total = 0;
+    u64a total = 0;         // Total number of blocks.
    while (current_level * MMB_KEY_BITS < total_bits) {
        total += current_level;
        current_level <<= MMB_KEY_SHIFT;
    }
-    total += (total_bits + MMB_KEY_BITS - 1)/MMB_KEY_BITS;
+
-    return sizeof(MMB_TYPE) * total;
+    // Last level is a one-for-one bit vector. It needs room for total_bits
    // elements, rounded up to the nearest block.
    u64a last_level = ((u64a)total_bits + MMB_KEY_BITS - 1) / MMB_KEY_BITS;
    total += last_level;
    assert(total * sizeof(MMB_TYPE) <= UINT32_MAX);
    return (u32)(total * sizeof(MMB_TYPE));
 }
 #ifdef DUMP_SUPPORT
--- a/src/util/multibit.h
+++ b/src/util/multibit.h
@ -235,18 +235,18 @@ const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) {
 /** \brief get the block for this key on the current level as a u8 ptr */
 static really_inline
 u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
-    return mmbit_get_level_root(bits, level) +
+    u8 *level_root = mmbit_get_level_root(bits, level);
-           (key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) *
+    u32 ks = mmbit_get_ks(max_level, level);
-               sizeof(MMB_TYPE);
+    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
 }
 /** \brief get the block for this key on the current level as a const u8 ptr */
 static really_inline
 const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level,
                                    u32 key) {
-    return mmbit_get_level_root_const(bits, level) +
+    const u8 *level_root = mmbit_get_level_root_const(bits, level);
-           (key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) *
+    u32 ks = mmbit_get_ks(max_level, level);
-               sizeof(MMB_TYPE);
+    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
 }
 /** \brief get the _byte_ for this key on the current level as a u8 ptr */
@ -254,7 +254,7 @@ static really_inline
 u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
    u8 *level_root = mmbit_get_level_root(bits, level);
    u32 ks = mmbit_get_ks(max_level, level);
-    return level_root + (key >> (ks + MMB_KEY_SHIFT - 3));
+    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3));
 }
 /** \brief get our key value for the current level */
@ -721,11 +721,11 @@ u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin,
 }
 static really_inline
-MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max,
+MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max,
-                         u32 block_base) {
+                         u64a block_base) {
    const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT;
-    u32 lshift = (block_min - block_base) >> level_shift;
+    u64a lshift = (block_min - block_base) >> level_shift;
-    u32 ushift = (block_max - block_base) >> level_shift;
+    u64a ushift = (block_max - block_base) >> level_shift;
    MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0;
    MMB_TYPE umask =
        ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES;
@ -734,7 +734,7 @@ MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max,
 static really_inline
 u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) {
-    u32 key = 0;
+    u64a key = 0;
    u32 ks = mmbit_keyshift(total_bits);
    const u32 max_level = mmbit_maxlevel_from_keyshift(ks);
    u32 level = 0;
@ -743,9 +743,9 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32
        assert(level <= max_level);
        u32 block_width = MMB_KEY_BITS << ks;
-        u32 block_base = key*block_width;
+        u64a block_base = key * block_width;
-        u32 block_min = MAX(it_start, block_base);
+        u64a block_min = MAX(it_start, block_base);
-        u32 block_max = MIN(it_end, block_base + block_width - 1);
+        u64a block_max = MIN(it_end, block_base + block_width - 1);
        const u8 *block_ptr =
            mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE);
        MMB_TYPE block = mmb_load(block_ptr);
@ -761,13 +761,14 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32
            // No bit found, go up a level
            // we know that this block didn't have any answers, so we can push
            // our start iterator forward.
-            it_start = block_base + block_width;
+            u64a next_start = block_base + block_width;
-            if (it_start > it_end) {
+            if (next_start > it_end) {
                break;
            }
            if (level-- == 0) {
                break;
            }
            it_start = next_start;
            key >>= MMB_KEY_SHIFT;
            ks += MMB_KEY_SHIFT;
        }
--- a/src/util/report_manager.cpp
+++ b/src/util/report_manager.cpp
@ -128,11 +128,9 @@ vector<ReportID> ReportManager::getDkeyToReportTable() const {
 }
 void ReportManager::assignDkeys(const RoseBuild *rose) {
    unique_ptr<RoseDedupeAux> dedupe = rose->generateDedupeAux();
    DEBUG_PRINTF("assigning...\n");
-    map<u32, set<ReportID>> ext_to_int;
+    map<u32, ue2::flat_set<ReportID>> ext_to_int;
    for (u32 i = 0; i < reportIds.size(); i++) {
        const Report &ir = reportIds[i];
@ -143,6 +141,8 @@ void ReportManager::assignDkeys(const RoseBuild *rose) {
        }
    }
    auto dedupe = rose->generateDedupeAux();
    for (const auto &m : ext_to_int) {
        u32 ext = m.first;
--- a/unit/CMakeLists.txt
+++ b/unit/CMakeLists.txt
@ -7,7 +7,8 @@ if(NOT XCODE)
 else()
    set(CMAKE_CXX_FLAGS "-isystem ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CXX_FLAGS}")
 endif()
-include_directories(${CMAKE_SOURCE_DIR}/util)
+
 include_directories(${PROJECT_SOURCE_DIR})
 # remove some warnings
 # cmake's scope means these only apply here
@ -26,7 +27,7 @@ endif()
 add_library(gtest ${gtest_SOURCES})
-add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${CMAKE_SOURCE_DIR})
+add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR})
 if (NOT RELEASE_BUILD)
 set(unit_internal_SOURCES
--- a/unit/hyperscan/bad_patterns.txt
+++ b/unit/hyperscan/bad_patterns.txt
@ -85,7 +85,6 @@
 84:/[=\]=]/ #Unsupported POSIX collating element at index 0.
 85:/A(?!)+Z/ #Invalid repeat at index 5.
 86:/\X/ #\X unsupported at index 0.
 87:/[a\Qz\E]/ #\Q..\E sequences in character classes not supported at index 2.
 88:/[A-\d]/ #Invalid range in character class at index 3.
 89:/[A-[:digit:]]/ #Invalid range in character class at index 3.
 90:/B[--[:digit:]--]+/ #Invalid range in character class at index 4.
@ -128,3 +127,8 @@
 128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ñññññññññññññññññññññññññññ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8.
 129:/bignum \1111111111111111111/ #Number is too big at index 7.
 130:/foo|&{5555555,}/ #Bounded repeat is too large.
 131:/[a[..]]/ #Unsupported POSIX collating element at index 2.
 132:/[a[==]]/ #Unsupported POSIX collating element at index 2.
 133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2.
 134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2.
 135:/[^\D\d]/8W #Pattern can never match.
--- a/unit/internal/multi_bit.cpp
+++ b/unit/internal/multi_bit.cpp
@ -363,7 +363,9 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) {
    ASSERT_TRUE(ba != nullptr);
    // Set one bit on and run some checks.
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        mmbit_clear(ba, test_size);
        mmbit_set(ba, test_size, i);
@ -381,7 +383,12 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) {
        // Scanning from one past our bit to the end should find nothing.
        if (i != test_size - 1) {
-            ASSERT_EQ(MMB_INVALID, mmbit_iterate_bounded(ba, test_size, i + 1, test_size));
+            // Ordinary iterator.
            ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, i));
            // Bounded iterator.
            ASSERT_EQ(MMB_INVALID,
                      mmbit_iterate_bounded(ba, test_size, i + 1, test_size));
        }
    }
 }
@ -393,7 +400,7 @@ TEST_P(MultiBitTest, BoundedIteratorAll) {
    // Switch everything on.
    fill_mmbit(ba, test_size);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        if (i != 0) {
            ASSERT_EQ(0U, mmbit_iterate_bounded(ba, test_size, 0, i));
        }
@ -408,13 +415,13 @@ TEST_P(MultiBitTest, BoundedIteratorEven) {
    // Set every even-numbered bit and see what we can see.
    mmbit_clear(ba, test_size);
-    for (u32 i = 0; i < test_size; i += 2) {
+    for (u64a i = 0; i < test_size; i += 2) {
        mmbit_set(ba, test_size, i);
    }
    u32 even_stride = stride % 2 ? stride + 1 : stride;
-    for (u32 i = 0; i < test_size; i += even_stride) {
+    for (u64a i = 0; i < test_size; i += even_stride) {
        // Scanning from each even bit to the end should find itself.
        ASSERT_EQ(i, mmbit_iterate_bounded(ba, test_size, i, test_size));
@ -439,13 +446,13 @@ TEST_P(MultiBitTest, BoundedIteratorOdd) {
    // Set every odd-numbered bit and see what we can see.
    mmbit_clear(ba, test_size);
-    for (u32 i = 1; i < test_size; i += 2) {
+    for (u64a i = 1; i < test_size; i += 2) {
        mmbit_set(ba, test_size, i);
    }
    u32 even_stride = stride % 2 ? stride + 1 : stride;
-    for (u32 i = 0; i < test_size; i += even_stride) {
+    for (u64a i = 0; i < test_size; i += even_stride) {
        // Scanning from each even bit to the end should find i+1.
        if (i+1 < test_size) {
            ASSERT_EQ(i+1, mmbit_iterate_bounded(ba, test_size, i, test_size));
@ -473,7 +480,7 @@ TEST_P(MultiBitTest, Set) {
    mmbit_clear(ba, test_size);
    ASSERT_FALSE(mmbit_any(ba, test_size));
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        // set a bit that wasn't set before
@ -500,7 +507,7 @@ TEST_P(MultiBitTest, Iter) {
    mmbit_clear(ba, test_size);
    ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID));
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        mmbit_clear(ba, test_size);
        mmbit_set(ba, test_size, i);
@ -517,13 +524,13 @@ TEST_P(MultiBitTest, IterAll) {
    ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID));
    // Set all bits.
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        mmbit_set(ba, test_size, i);
    }
    // Find all bits.
    u32 it = MMB_INVALID;
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        ASSERT_EQ(i, mmbit_iterate(ba, test_size, it));
        it = i;
    }
@ -536,7 +543,7 @@ TEST_P(MultiBitTest, AnyPrecise) {
    mmbit_clear(ba, test_size);
    ASSERT_FALSE(mmbit_any_precise(ba, test_size));
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        mmbit_clear(ba, test_size);
        mmbit_set(ba, test_size, i);
@ -551,7 +558,7 @@ TEST_P(MultiBitTest, Any) {
    mmbit_clear(ba, test_size);
    ASSERT_FALSE(mmbit_any(ba, test_size));
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        mmbit_clear(ba, test_size);
        mmbit_set(ba, test_size, i);
@ -567,7 +574,7 @@ TEST_P(MultiBitTest, UnsetRange1) {
    fill_mmbit(ba, test_size);
    // Use mmbit_unset_range to switch off any single bit.
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        ASSERT_TRUE(mmbit_isset(ba, test_size, i));
        mmbit_unset_range(ba, test_size, i, i + 1);
@ -590,7 +597,7 @@ TEST_P(MultiBitTest, UnsetRange2) {
    // Use mmbit_unset_range to switch off all bits.
    mmbit_unset_range(ba, test_size, 0, test_size);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        ASSERT_FALSE(mmbit_isset(ba, test_size, i));
    }
@ -601,12 +608,12 @@ TEST_P(MultiBitTest, UnsetRange3) {
    ASSERT_TRUE(ba != nullptr);
    // Use mmbit_unset_range to switch off bits in chunks of 3.
-    for (u32 i = 0; i < test_size - 3; i += stride) {
+    for (u64a i = 0; i < test_size - 3; i += stride) {
        // Switch on the bit before, the bits in question, and the bit after.
        if (i > 0) {
            mmbit_set(ba, test_size, i - 1);
        }
-        for (u32 j = i; j < min(i + 4, test_size); j++) {
+        for (u64a j = i; j < min(i + 4, (u64a)test_size); j++) {
            mmbit_set(ba, test_size, j);
        }
@ -635,7 +642,7 @@ TEST_P(MultiBitTest, InitRangeAll) {
    mmbit_init_range(ba, test_size, 0, test_size);
    // Make sure they're all set.
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        SCOPED_TRACE(i);
        ASSERT_TRUE(mmbit_isset(ba, test_size, i));
    }
@ -656,7 +663,7 @@ TEST_P(MultiBitTest, InitRangeOne) {
    SCOPED_TRACE(test_size);
    ASSERT_TRUE(ba != nullptr);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        mmbit_init_range(ba, test_size, i, i + 1);
        // Only bit 'i' should be on.
@ -685,7 +692,7 @@ TEST_P(MultiBitTest, InitRangeChunked) {
            ASSERT_EQ(chunk_begin, mmbit_iterate(ba, test_size, MMB_INVALID));
            // All bits in the chunk should be on.
-            for (u32 i = chunk_begin; i < chunk_end; i += stride) {
+            for (u64a i = chunk_begin; i < chunk_end; i += stride) {
                SCOPED_TRACE(i);
                ASSERT_TRUE(mmbit_isset(ba, test_size, i));
            }
@ -985,7 +992,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginAll) {
    vector<mmbit_sparse_iter> it;
    vector<u32> bits;
    bits.reserve(test_size / stride);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        bits.push_back(i);
    }
    mmbBuildSparseIterator(it, bits, test_size);
@ -1032,7 +1039,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) {
    // Switch every third bits on in state
    mmbit_clear(ba, test_size);
    ASSERT_FALSE(mmbit_any(ba, test_size));
-    for (u32 i = 0; i < test_size; i += 3) {
+    for (u64a i = 0; i < test_size; i += 3) {
        mmbit_set(ba, test_size, i);
    }
@ -1044,7 +1051,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) {
    ASSERT_EQ(0U, val);
    ASSERT_EQ(0U, idx);
-    for (u32 i = 0; i < test_size - 3; i += 3) {
+    for (u64a i = 0; i < test_size - 3; i += 3) {
        mmbit_unset(ba, test_size, i);
        val = mmbit_sparse_iter_begin(ba, test_size, &idx, &it[0], &state[0]);
        ASSERT_EQ(i+3, val);
@ -1060,7 +1067,7 @@ TEST_P(MultiBitTest, SparseIteratorNextAll) {
    vector<mmbit_sparse_iter> it;
    vector<u32> bits;
    bits.reserve(test_size / stride);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        bits.push_back(i);
    }
    mmbBuildSparseIterator(it, bits, test_size);
@ -1103,7 +1110,7 @@ TEST_P(MultiBitTest, SparseIteratorNextExactStrided) {
    vector<mmbit_sparse_iter> it;
    vector<u32> bits;
    bits.reserve(test_size / stride);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        bits.push_back(i);
        mmbit_set(ba, test_size, i);
    }
@ -1135,7 +1142,7 @@ TEST_P(MultiBitTest, SparseIteratorNextNone) {
    vector<mmbit_sparse_iter> it;
    vector<u32> bits;
    bits.reserve(test_size / stride);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        bits.push_back(i);
    }
    mmbBuildSparseIterator(it, bits, test_size);
@ -1164,7 +1171,7 @@ TEST_P(MultiBitTest, SparseIteratorUnsetAll) {
    vector<mmbit_sparse_iter> it;
    vector<u32> bits;
    bits.reserve(test_size / stride);
-    for (u32 i = 0; i < test_size; i += stride) {
+    for (u64a i = 0; i < test_size; i += stride) {
        bits.push_back(i);
    }
    mmbBuildSparseIterator(it, bits, test_size);
@ -1194,10 +1201,10 @@ TEST_P(MultiBitTest, SparseIteratorUnsetHalves) {
    // Two sparse iterators: one for even bits, one for odd ones
    vector<u32> even, odd;
-    for (u32 i = 0; i < test_size; i += 2) {
+    for (u64a i = 0; i < test_size; i += 2) {
        even.push_back(i);
    }
-    for (u32 i = 1; i < test_size; i += 2) {
+    for (u64a i = 1; i < test_size; i += 2) {
        odd.push_back(i);
    }
@ -1277,9 +1284,9 @@ static const MultiBitTestParam multibitTests[] = {
    { 1U << 28, 15073 },
    { 1U << 29, 24413 },
    { 1U << 30, 50377 },
    { 1U << 31, 104729 },
-    // XXX: cases this large segfault in mmbit_set, FIXME NOW
+    // { UINT32_MAX, 104729 }, // Very slow
    //{ 1U << 31, 3701 },
 };
 INSTANTIATE_TEST_CASE_P(MultiBit, MultiBitTest, ValuesIn(multibitTests));
--- a/unit/internal/nfagraph_find_matches.cpp
+++ b/unit/internal/nfagraph_find_matches.cpp
@ -36,9 +36,9 @@
 #include "nfagraph/ng_builder.h"
 #include "nfagraph/ng.h"
 #include "nfagraph/ng_asserts.h"
 #include "util/target_info.h"
 #include "hs_compile.h"
-#include "ng_find_matches.h"
+#include "util/ng_find_matches.h"
 #include "util/target_info.h"
 using namespace std;
 using namespace testing;
--- a/unit/internal/repeat.cpp
+++ b/unit/internal/repeat.cpp
@ -448,6 +448,25 @@ TEST_P(RepeatTest, Pack) {
    }
 }
 // For every power-of-two gap 2^0 .. 2^63, stores a first top at offset 1000
 // and a second top exactly 2^i bytes later, checking after each store that
 // repeatLastTop() reports the offset that was just stored. Parameter sets
 // whose repeatMax is REPEAT_INF are skipped entirely.
 TEST_P(RepeatTest, LargeGap) {
    SCOPED_TRACE(testing::Message() << "Repeat: " << info);
    if (info.repeatMax == REPEAT_INF) {
        return; // Test not valid for FIRST-type repeats.
    }
    for (int i = 0; i < 64; i++) {
        // Each iteration starts over with a fresh first top.
        // NOTE(review): the final repeatStore() argument is 0 here and 1 for
        // the second top; presumably it marks whether earlier tops are still
        // live -- confirm against the repeatStore() declaration.
        u64a top1 = 1000;
        repeatStore(&info, ctrl, state, top1, 0); // first top
        ASSERT_EQ(top1, repeatLastTop(&info, ctrl, state));
        // Add a second top after a gap of 2^i bytes.
        u64a top2 = top1 + (1ULL << i);
        repeatStore(&info, ctrl, state, top2, 1); // second top
        ASSERT_EQ(top2, repeatLastTop(&info, ctrl, state));
    }
 }
 static
 const u32 sparsePeriods[] = {
    2,
@ -505,6 +524,7 @@ const RepeatTestInfo sparseRepeats[] = {
    { REPEAT_SPARSE_OPTIMAL_P, 4000, 4000 },
    { REPEAT_SPARSE_OPTIMAL_P, 4500, 4500 },
    { REPEAT_SPARSE_OPTIMAL_P, 5000, 5000 },
    { REPEAT_SPARSE_OPTIMAL_P, 65534, 65534 },
    // {N, M} repeats
    { REPEAT_SPARSE_OPTIMAL_P, 10, 20 },
    { REPEAT_SPARSE_OPTIMAL_P, 20, 40 },
@ -528,7 +548,8 @@ const RepeatTestInfo sparseRepeats[] = {
    { REPEAT_SPARSE_OPTIMAL_P, 3500, 4000 },
    { REPEAT_SPARSE_OPTIMAL_P, 4000, 8000 },
    { REPEAT_SPARSE_OPTIMAL_P, 4500, 8000 },
-    { REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 }
+    { REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 },
    { REPEAT_SPARSE_OPTIMAL_P, 60000, 65534 }
 };
 static
@ -802,7 +823,7 @@ TEST_P(SparseOptimalTest, Simple1) {
                                  1000 + info->repeatMax * 2));
    ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state,
                                  1000 + info->repeatMax * 2 + 1));
-    ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 10000));
+    ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 100000));
 }
 TEST_P(SparseOptimalTest, TwoTopsNeg) {
@ -893,6 +914,24 @@ TEST_P(SparseOptimalTest, Simple3e) {
    test_sparse3entryExpire(info, ctrl, state, 2 * info->minPeriod - 1);
 }
 // Sparse-repeat counterpart of the large-gap check: for each power-of-two gap
 // 2^0 .. 2^63, stores a first top at offset 1000 and, when the gap is at least
 // info->minPeriod, a second top 2^i bytes later. After every store the test
 // checks that repeatLastTop() reports the offset that was just stored.
 TEST_P(SparseOptimalTest, LargeGap) {
    SCOPED_TRACE(testing::Message() << "Repeat: " << *info);
    for (int i = 0; i < 64; i++) {
        // The first top is stored on every iteration, even when the second
        // one ends up being skipped below.
        u64a top1 = 1000;
        repeatStore(info, ctrl, state, top1, 0); // first top
        ASSERT_EQ(top1, repeatLastTop(info, ctrl, state));
        // Add a second top after a gap of 2^i bytes.
        u64a top2 = top1 + (1ULL << i);
        // Gaps shorter than the repeat's minimum period are not exercised.
        if (top2 - top1 < info->minPeriod) {
            continue; // not a valid top
        }
        repeatStore(info, ctrl, state, top2, 1); // second top
        ASSERT_EQ(top2, repeatLastTop(info, ctrl, state));
    }
 }
 TEST_P(SparseOptimalTest, ThreeTops) {
    SCOPED_TRACE(testing::Message() << "Repeat: " << *info);