diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..c137017a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,36 @@
+# Hyperscan Change Log
+
+This is a list of notable changes to Hyperscan, in reverse chronological order.
+
+## [4.1.0] 2015-12-18
+- Update version of PCRE used by testing tools as a syntax and semantic
+ reference to PCRE 8.38.
+- Small updates to fix warnings identified by Coverity.
+- Clean up and unify exception handling behaviour across GPR and SIMD NFA
+ models.
+- Fix bug in handling of bounded repeat triggers with large gaps between them
+ for sparse repeat model.
+- Correctly reject POSIX collating elements (`[.ch.]`, `[=ch=]`) in the parser.
+ These are not supported by Hyperscan.
+- Add support for quoted sequences (`\Q...\E`) inside character classes.
+- Simplify FDR literal matcher runtime by removing some static specialization.
+- Fix handling of the POSIX `[:graph:]`, `[:print:]` and `[:punct:]` character
+ classes to match the behaviour of PCRE 8.38 in both standard operation and
+ with the UCP flag set. (Note: some bugs were fixed in this area in PCRE
+ 8.38.) Previously Hyperscan's behaviour was the same as versions of PCRE
+ before 8.34.
+- Improve performance when compiling pattern sets that include a large number
+ of similar bounded repeat constructs. (github issue #9)
+
+## [4.0.1] 2015-10-30
+- Minor cleanups to test code.
+- CMake and other build system improvements.
+- API update: allow `hs_reset_stream()` and `hs_reset_and_copy_stream()` to be
+ supplied with a NULL scratch pointer if no matches are required. This is in
+ line with the behaviour of `hs_close_stream()`.
+- Disallow bounded repeats with a very large minimum repeat but no maximum,
+ i.e. {N,} for very large N.
+- Reduce compile memory usage in literal set expansion for some large cases.
+
+## [4.0.0] 2015-10-20
+- Original release of Hyperscan as open-source software.
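
A minimal usage sketch (editorial, not part of the patch) for the relaxed
`hs_reset_stream()` behaviour noted in the 4.0.1 entry above. It assumes the
documented streaming runtime API from `hs_runtime.h` and a stream already
opened with `hs_open_stream()`; the helper name `recycle_stream` and the
header path are illustrative only.

    #include <hs/hs.h>
    #include <stddef.h>

    /* Hypothetical helper: rewind an open stream for reuse without reporting
     * the matches that would otherwise fire at end-of-data. Since no match
     * callback is supplied, a NULL scratch pointer is accepted, mirroring
     * hs_close_stream(). */
    static hs_error_t recycle_stream(hs_stream_t *stream) {
        return hs_reset_stream(stream, 0 /* flags */, NULL /* scratch */,
                               NULL /* onEvent */, NULL /* context */);
    }

When end-of-data matches are wanted, a match callback and a valid scratch
region are still required, exactly as before.
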
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67885c93..b4d81754 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 2.8.11)
project (Hyperscan C CXX)
set (HS_MAJOR_VERSION 4)
-set (HS_MINOR_VERSION 0)
-set (HS_PATCH_VERSION 1)
+set (HS_MINOR_VERSION 1)
+set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
-set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
INCLUDE (CheckFunctionExists)
@@ -56,8 +56,9 @@ if(CMAKE_GENERATOR STREQUAL Xcode)
set(XCODE TRUE)
endif()
-include_directories(src .)
-include_directories(${CMAKE_BINARY_DIR})
+set(CMAKE_INCLUDE_CURRENT_DIR 1)
+include_directories(${PROJECT_SOURCE_DIR}/src)
+include_directories(${PROJECT_BINARY_DIR})
include_directories(SYSTEM include)
set(BOOST_USE_STATIC_LIBS OFF)
@@ -71,7 +72,7 @@ find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND)
# we might have boost in tree, so provide a hint and try again
message(STATUS "trying include dir for boost")
- set(BOOST_INCLUDEDIR "${CMAKE_SOURCE_DIR}/include")
+ set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
find_package(Boost ${BOOST_MINVERSION})
if(NOT Boost_FOUND)
message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.")
@@ -219,6 +220,15 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
+if (RELEASE_BUILD)
+ if (HAS_C_HIDDEN)
+ set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
+ endif()
+ if (HAS_CXX_HIDDEN)
+ set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden")
+ endif()
+endif()
+
# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
@@ -327,8 +337,8 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/tools)
endif()
# do substitutions
-configure_file(${CMAKE_MODULE_PATH}/config.h.in ${CMAKE_BINARY_DIR}/config.h)
-configure_file(src/hs_version.h.in hs_version.h)
+configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
+configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
if (PKG_CONFIG_FOUND)
# we really only need to do this if we have pkg-config
@@ -345,7 +355,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# include the autogen targets
add_subdirectory(src/fdr)
-include_directories(${CMAKE_BINARY_DIR}/src/fdr)
+include_directories(${PROJECT_BINARY_DIR}/src/fdr)
if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused")
diff --git a/README.md b/README.md
index 37fc38ce..1753ecbe 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,24 @@ the [Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/)
Hyperscan is licensed under the BSD License. See the LICENSE file in the
project repository.
+# Versioning
+
+The `master` branch on Github will always contain the most recent release of
+Hyperscan. Each version released to `master` goes through QA and testing before
+it is released; if you're a user, rather than a developer, this is the version
+you should be using.
+
+Further development towards the next release takes place on the `develop`
+branch.
+
+# Get Involved
+
+The official homepage for Hyperscan is at [01.org/hyperscan](https://01.org/hyperscan).
+
+If you have questions or comments, we encourage you to [join the mailing
+list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by
+sending email to the list, or by creating an issue on Github.
+
+If you wish to contact the Hyperscan team at Intel directly, without posting
+publicly to the mailing list, send email to
+[hyperscan@intel.com](mailto:hyperscan@intel.com).
diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst
index 6e195f6a..f3723dc9 100644
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@@ -63,6 +63,9 @@ described at . However, not all constructs available in
libpcre are supported. The use of unsupported constructs will result in
compilation errors.
+The version of PCRE used to validate Hyperscan's interpretation of this syntax
+is 8.38.
+
====================
Supported Constructs
====================
diff --git a/examples/simplegrep.c b/examples/simplegrep.c
index 2fe6e6f3..9e392a8f 100644
--- a/examples/simplegrep.c
+++ b/examples/simplegrep.c
@@ -109,7 +109,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
* limit the size of our buffer appropriately. */
if ((unsigned long)dataLen > UINT_MAX) {
dataLen = UINT_MAX;
- printf("WARNING: clipping data to %lu bytes\n", dataLen);
+ printf("WARNING: clipping data to %ld bytes\n", dataLen);
} else if (dataLen == 0) {
fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
fclose(f);
@@ -118,7 +118,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) {
char *inputData = malloc(dataLen);
if (!inputData) {
- fprintf(stderr, "ERROR: unable to malloc %lu bytes\n", dataLen);
+ fprintf(stderr, "ERROR: unable to malloc %ld bytes\n", dataLen);
fclose(f);
return NULL;
}
diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt
index 25396689..1436c3fc 100644
--- a/src/fdr/CMakeLists.txt
+++ b/src/fdr/CMakeLists.txt
@@ -27,11 +27,11 @@ fdr_autogen(teddy_runtime teddy_autogen.c)
fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
set(fdr_GENERATED_SRC
-${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen.c
-${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
-${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen.c
-${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
-PARENT_SCOPE)
+ ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
+ ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
+ ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
+ ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
+ PARENT_SCOPE)
set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py
index 36e4c16c..e5b4f39e 100755
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
def build_fdr_matchers():
all_matchers = [ ]
- domains = [8, 10, 11, 12, 13]
- big_domains = [ 14, 15 ]
+ strides = [ 1, 2, 4 ]
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
- for d in domains:
- all_matchers += [ M3(stride = 1, domain = d, **common) ]
- all_matchers += [ M3(stride = 2, domain = d, **common) ]
- all_matchers += [ M3(stride = 4, domain = d, **common) ]
- for d in big_domains:
- all_matchers += [ M3(stride = 1, domain = d, **common) ]
+ for s in strides:
+ all_matchers += [ M3(stride = s, **common) ]
return all_matchers
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 082800f1..f83a4265 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -40,27 +40,6 @@
#include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h"
#include "fdr_loadval.h"
-
-static really_inline UNUSED
-u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
- u32 r = 0;
- if (a->start_offset == 0) {
- if (numBits <= 8) {
- r = a->buf_history[a->len_history - 1];
- } else {
- r = a->buf_history[a->len_history - 1];
- r |= (a->buf[0] << 8);
- }
- } else {
- if (numBits <= 8) {
- r = a->buf[a->start_offset - 1];
- } else {
- r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
- }
- }
- return r & ((1 << numBits) - 1);
-}
-
#include "fdr_autogen.c"
#define FAKE_HISTORY_SIZE 16
diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py
index 685cca3b..748d811f 100755
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@@ -74,12 +74,12 @@ class ValueExtractStep(Step):
dsb = m.datasize_bytes
modval = offset % dsb
- if m.domain > 8 and modval == dsb - 1:
+ if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
- code_string = "cautious_forward"
+ code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
@@ -101,7 +101,7 @@ class ValueExtractStep(Step):
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
- init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+ init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
@@ -173,14 +173,10 @@ class ConfirmStep(Step):
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
- def get_hash_safety_parameters(self):
- h_size = self.single_load_type.size_in_bytes()
- return (0, h_size - 1)
-
def produce_compile_call(self):
- print " { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+ print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
- self.stride, self.domain,
+ self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
@@ -192,8 +188,8 @@ class M3(MatcherBase):
ctxt = CodeGenContext(self)
if switch_variant:
- print " ptr -= (iterBytes - dist);"
- print " { " # need an extra scope around switch variant to stop its globals escaping
+ print " ptr -= (iterBytes - dist);"
+ print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
@@ -349,25 +345,30 @@ class M3(MatcherBase):
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
- $TYPENAME s;
- if (a->len_history) {
- u32 tmp = getPreStartVal(a, $DOMAIN);
- s = *((const $TYPENAME *)ft + tmp);
- $SHIFT_EXPR;
- } else {
- s = *(const $TYPENAME *)&fdr->start;
- }
+ $TYPENAME s;
+ if (a->len_history) {
+ u32 tmp = 0;
+ if (a->start_offset == 0) {
+ tmp = a->buf_history[a->len_history - 1];
+ tmp |= (a->buf[0] << 8);
+ } else {
+ tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
+ }
+ tmp &= fdr->domainMask;
+ s = *((const $TYPENAME *)ft + tmp);
+ $SHIFT_EXPR;
+ } else {
+ s = *(const $TYPENAME *)&fdr->start;
+ }
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
- DOMAIN = self.domain,
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
- (behind, ahead) = self.get_hash_safety_parameters()
- loop_read_behind = behind
- loop_read_ahead = self.loop_bytes + ahead
+ loop_read_behind = 0
+ loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
@@ -380,7 +381,7 @@ class M3(MatcherBase):
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
- if ssb == 16 and self.domain == 16:
+ if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
@@ -390,7 +391,6 @@ class M3(MatcherBase):
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
- self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
print self.produce_header(visible = False)
@@ -398,21 +398,19 @@ class M3(MatcherBase):
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
- print " Domain: %d" % self.domain,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
- print
- print "\tconst size_t tabSize = %d;" % self.table_size
- print """
- const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
- const u32 * confBase = (const u32 *)(ft + tabSize);
-"""
+ print " assert(fdr->domain > 8 && fdr->domain < 16);"
+ print
+ print " u64a domain_mask = fdr->domainMask;"
+ print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
+ print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
- print "\tconst size_t iterBytes = %d;" % self.loop_bytes
- print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
- print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+ print " const size_t iterBytes = %d;" % self.loop_bytes
+ print " const size_t START_MOD = %d;" % self.datasize_bytes
+ print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
@@ -451,9 +449,9 @@ class M3(MatcherBase):
print self.produce_footer()
def get_name(self):
- return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+ return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
- def __init__(self, state_width, domain, stride,
+ def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
@@ -474,17 +472,9 @@ class M3(MatcherBase):
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
- # domain is the number of bits that we draw from our input to
- # index our 'reach' table
- if not 8 <= domain <= 16:
- fail_out("Unsupported domain: %d" % domain)
- self.domain = domain
- # this is the load type required for this domain if we want to
+ # this is the load type required for domain [9:15] if we want to
# load it one at a time
- self.single_load_type = getRequiredType(self.domain)
-
- # table size
- self.table_size = 2**domain * table_state_width // 8
+ self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table
diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp
index 8d658ccd..ccf17626 100644
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@@ -184,6 +184,13 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) {
ptr += floodControlTmp.second;
aligned_free(floodControlTmp.first);
+ /* we are allowing domains 9 to 15 only */
+ assert(eng.bits > 8 && eng.bits < 16);
+ fdr->domain = eng.bits;
+ fdr->schemeWidthByte = eng.schemeWidth / 8;
+ fdr->domainMask = (1 << eng.bits) - 1;
+ fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+
if (link.first) {
fdr->link = verify_u32(ptr - fdr_base);
memcpy(ptr, link.first, link.second);
@@ -245,6 +252,8 @@ void FDRCompiler::assignStringsToBuckets() {
typedef pair SCORE_INDEX_PAIR;
u32 ls = verify_u32(lits.size());
+ assert(ls); // Shouldn't be called with no literals.
+
// make a vector that contains our literals as pointers or u32 LiteralIndex values
vector vli;
vli.resize(ls);
@@ -292,6 +301,8 @@ void FDRCompiler::assignStringsToBuckets() {
currentChunk++;
}
}
+
+ assert(currentChunk > 0);
count[currentChunk - 1] = ls - chunkStartID;
// close off chunks with an empty row
firstIds[currentChunk] = ls;
@@ -383,12 +394,14 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
const vector &lits,
SuffixPositionInString pos,
std::map > &m2) {
+ assert(eng.bits < 32);
+
u32 distance = 0;
if (eng.bits <= 8) {
distance = 1;
} else if (eng.bits <= 16) {
distance = 2;
- } else if (eng.bits <= 32) {
+ } else {
distance = 4;
}
@@ -528,6 +541,11 @@ fdrBuildTableInternal(const vector &lits, bool make_small,
return nullptr;
}
+ // temporary hack for unit testing
+ if (hint != HINT_INVALID) {
+ des->bits = 9;
+ }
+
FDRCompiler fc(lits, *des, make_small);
return fc.build(link);
}
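
For orientation (editorial, not part of the patch): a small standalone sketch
of how the fields written in `setupFDR()` above work out, assuming a
hypothetical engine with the smallest permitted domain (9 bits) and a 128-bit
scheme width, matching the stride-only matchers now generated by autogen.py.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Assumed inputs: eng.bits == 9, eng.schemeWidth == 128. */
        uint32_t bits = 9;
        uint32_t schemeWidth = 128;

        uint32_t schemeWidthByte = schemeWidth / 8;         /* 16 bytes per table row */
        uint32_t domainMask = (1u << bits) - 1;             /* 0x1ff */
        uint32_t tabSize = (1u << bits) * schemeWidthByte;  /* 8192-byte reach table */

        printf("domainMask=0x%x tabSize=%u\n", domainMask, tabSize);
        return 0;
    }

The runtime side consumes these fields directly: the generated matchers now
read `fdr->domainMask` and `fdr->tabSize` at run time instead of baking a
per-domain constant into each specialized variant.
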
diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp
index ae246270..158170c2 100644
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
unique_ptr<FDREngineDescription> des =
getFdrDescription(fdr->engineID);
if (des) {
+ fprintf(f, " domain %u\n", des->bits);
fprintf(f, " stride %u\n", des->stride);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " width %u\n", des->schemeWidth);
diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp
index 2a6fda79..5d470c7e 100644
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
- schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
+ schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@@ -105,76 +105,83 @@ unique_ptr chooseEngine(const target_t &target,
DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
desiredStride);
- const FDREngineDescription *best = nullptr;
+ FDREngineDescription *best = nullptr;
u32 best_score = 0;
- for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
- const FDREngineDescription &eng = allDescs[engineID];
- if (!eng.isValidOnTarget(target)) {
- continue;
- }
- if (msl < eng.stride) {
- continue;
- }
-
- u32 score = 100;
-
- score -= absdiff(desiredStride, eng.stride);
-
- if (eng.stride <= desiredStride) {
- score += eng.stride;
- }
-
- u32 effLits = vl.size(); /* * desiredStride;*/
- u32 ideal;
- if (effLits < eng.getNumBuckets()) {
- if (eng.stride == 1) {
- ideal = 8;
- } else {
- ideal = 10;
+ for (u32 domain = 9; domain <= 15; domain++) {
+ for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
+ // to make sure that domains >=14 have stride 1 according to origin
+ if (domain > 13 && engineID > 0) {
+ continue;
+ }
+ FDREngineDescription &eng = allDescs[engineID];
+ if (!eng.isValidOnTarget(target)) {
+ continue;
+ }
+ if (msl < eng.stride) {
+ continue;
}
- } else if (effLits < 20) {
- ideal = 10;
- } else if (effLits < 100) {
- ideal = 11;
- } else if (effLits < 1000) {
- ideal = 12;
- } else if (effLits < 10000) {
- ideal = 13;
- } else {
- ideal = 15;
- }
- if (ideal != 8 && eng.schemeWidth == 32) {
- ideal += 1;
- }
+ u32 score = 100;
- if (make_small) {
- ideal -= 2;
- }
+ score -= absdiff(desiredStride, eng.stride);
- if (eng.stride > 1) {
- ideal++;
- }
+ if (eng.stride <= desiredStride) {
+ score += eng.stride;
+ }
- DEBUG_PRINTF("effLits %u\n", effLits);
+ u32 effLits = vl.size(); /* * desiredStride;*/
+ u32 ideal;
+ if (effLits < eng.getNumBuckets()) {
+ if (eng.stride == 1) {
+ ideal = 8;
+ } else {
+ ideal = 10;
+ }
+ } else if (effLits < 20) {
+ ideal = 10;
+ } else if (effLits < 100) {
+ ideal = 11;
+ } else if (effLits < 1000) {
+ ideal = 12;
+ } else if (effLits < 10000) {
+ ideal = 13;
+ } else {
+ ideal = 15;
+ }
- if (target.is_atom_class() && !make_small && effLits < 4000) {
- /* Unless it is a very heavy case, we want to build smaller tables
- * on lightweight machines due to their small caches. */
- ideal -= 2;
- }
+ if (ideal != 8 && eng.schemeWidth == 32) {
+ ideal += 1;
+ }
- score -= absdiff(ideal, eng.bits);
+ if (make_small) {
+ ideal -= 2;
+ }
- DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
- "-> score=%u\n",
- eng.getID(), eng.schemeWidth, eng.bits,
- eng.getNumBuckets(), eng.stride, score);
+ if (eng.stride > 1) {
+ ideal++;
+ }
- if (!best || score > best_score) {
- best = &eng;
- best_score = score;
+ DEBUG_PRINTF("effLits %u\n", effLits);
+
+ if (target.is_atom_class() && !make_small && effLits < 4000) {
+ /* Unless it is a very heavy case, we want to build smaller tables
+ * on lightweight machines due to their small caches. */
+ ideal -= 2;
+ }
+
+ score -= absdiff(ideal, domain);
+
+ DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+ "-> score=%u\n",
+ eng.getID(), eng.schemeWidth, eng.bits,
+ eng.getNumBuckets(), eng.stride, score);
+
+ if (!best || score > best_score) {
+ eng.bits = domain;
+ best = &eng;
+ best_score = score;
+ }
}
}
diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h
index d936095b..45f64ac0 100644
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@@ -43,7 +43,6 @@ struct FDREngineDef {
u32 schemeWidth;
u32 numBuckets;
u32 stride;
- u32 bits;
u64a cpu_features;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;
diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h
index 6c722777..607e039c 100644
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@@ -76,9 +76,11 @@ struct FDR {
* structures (spillover strings and hash table) if we're a secondary
* structure. */
u32 link;
+ u8 domain; /* dynamic domain info */
+ u8 schemeWidthByte; /* scheme width in bytes */
+ u16 domainMask; /* pre-computed domain mask */
+ u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
- u32 pad2;
- u32 pad3;
union {
u32 s_u32;
diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp
index 5ea0e873..e5cc9267 100644
--- a/src/nfa/castlecompile.cpp
+++ b/src/nfa/castlecompile.cpp
@@ -58,11 +58,13 @@
#include
using namespace std;
+using boost::adaptors::map_keys;
using boost::adaptors::map_values;
namespace ue2 {
#define CASTLE_MAX_TOPS 32
+#define CLIQUE_GRAPH_MAX_SIZE 1000
static
u32 depth_to_u32(const depth &d) {
@@ -106,51 +108,35 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
}
static
-size_t literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b) {
+bool literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b,
+ const size_t dist) {
for (size_t i = 0; i < b.size(); i++) {
+ if (i > dist) {
+ return true;
+ }
size_t overlap_len = b.size() - i;
if (overlap_len <= a.size()) {
if (matches(a.end() - overlap_len, a.end(), b.begin(),
b.end() - i)) {
- return i;
+ return false;
}
} else {
assert(overlap_len > a.size());
if (matches(a.begin(), a.end(), b.end() - i - a.size(),
b.end() - i)) {
- return i;
+ return false;
}
}
}
- return b.size();
+ return b.size() > dist;
}
-// UE-2666 case 1: The problem of find largest exclusive subcastles group
-// can be reformulated as finding the largest clique (subgraph where every
-// vertex is connected to every other vertex) in the graph. We use an
-// approximate algorithm here to find the maximum clique.
-// References
-// ----------
-// [1] Boppana, R., & Halldórsson, M. M. (1992).
-// Approximating maximum independent sets by excluding subgraphs.
-// BIT Numerical Mathematics, 32(2), 180–196. Springer.
-// doi:10.1007/BF01994876
-// ----------
-
struct CliqueVertexProps {
CliqueVertexProps() {}
explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
u32 stateId = ~0U;
- u32 parentId = ~0U;
- bool leftChild = false; /* tells us if it is the left child of its parent */
- bool rightChildVisited = false; /* tells us if its right child is visited */
-
- vector<u32> clique1; /* clique for the left branch */
- vector<u32> indepSet1; /* independent set for the left branch */
- vector<u32> clique2; /* clique for the right branch */
- vector<u32> indepSet2; /* independent set for the right branch */
};
typedef boost::adjacency_list makeCG(const vector> &exclusiveSet) {
- u32 size = exclusiveSet.size();
-
- vector<CliqueVertex> vertices;
- unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
- for (u32 i = 0; i < size; ++i) {
- CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
- vertices.push_back(v);
- }
-
- // construct the complement graph, then its maximum independent sets
- // are equal to the maximum clique of the original graph
- for (u32 i = 0; i < size; ++i) {
- CliqueVertex s = vertices[i];
- vector complement(size, 0);
- for (u32 j = 0; j < exclusiveSet[i].size(); ++j) {
- u32 val = exclusiveSet[i][j];
- complement[val] = 1;
- }
-
- for (u32 k = i + 1; k < size; ++k) {
- if (!complement[k]) {
- CliqueVertex d = vertices[k];
- add_edge(s, d, *cg);
- }
- }
- }
- return cg;
-}
-
-static
-CliqueGraph createSubgraph(const CliqueGraph &cg,
- const vector<CliqueVertex> &vertices) {
- CliqueGraph g;
- map<u32, CliqueVertex> vertexMap;
- for (auto u : vertices) {
- u32 id = cg[u].stateId;
- CliqueVertex v = add_vertex(CliqueVertexProps(id), g);
- vertexMap[id] = v;
- }
-
- set<u32> found;
- for (auto u : vertices) {
- u32 srcId = cg[u].stateId;
- CliqueVertex src = vertexMap[srcId];
- found.insert(srcId);
- for (auto n : adjacent_vertices_range(u, cg)) {
- u32 dstId = cg[n].stateId;
- if (found.find(dstId) == found.end() &&
- vertexMap.find(dstId) != vertexMap.end()) {
- CliqueVertex dst = vertexMap[dstId];
- add_edge(src, dst, g);
- }
- }
- }
- return g;
-}
-
-static
-void getNeighborInfo(const CliqueGraph &g, vector<CliqueVertex> &neighbor,
- vector<CliqueVertex> &nonneighbor,
- const CliqueVertex &cv) {
+void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
+ const CliqueVertex &cv, const set<u32> &group) {
u32 id = g[cv].stateId;
ue2::unordered_set<u32> neighborId;
// find neighbors for cv
- for (auto v : adjacent_vertices_range(cv, g)) {
- neighbor.push_back(v);
- neighborId.insert(g[v].stateId);
- }
-
- // find non-neighbors for cv
- for (auto v : vertices_range(g)) {
- if (g[v].stateId != id &&
- neighborId.find(g[v].stateId) == neighborId.end()) {
- nonneighbor.push_back(v);
+ for (const auto &v : adjacent_vertices_range(cv, g)) {
+ if (g[v].stateId != id && contains(group, g[v].stateId)){
+ neighbor.push_back(g[v].stateId);
+ neighborId.insert(g[v].stateId);
+ DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
}
}
}
static
-void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n,
- vector<u32> &clique, vector<u32> &indepSet) {
- u32 id = cg[n].stateId;
- if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) {
- cg[n].clique1.push_back(id);
- clique.swap(cg[n].clique1);
- } else {
- clique.swap(cg[n].clique2);
- }
+void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique) {
+ stack<vector<u32>> gStack;
- if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) {
- cg[n].indepSet2.push_back(id);
- indepSet.swap(cg[n].indepSet2);
- } else {
- indepSet.swap(cg[n].indepSet1);
- }
-}
-
-static
-void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique,
- vector<u32> &indepSet) {
- stack<CliqueGraph> gStack;
- gStack.push(cg);
-
- // create mapping between vertex and id
+ // Create mapping between vertex and id
map<u32, CliqueVertex> vertexMap;
- for (auto v : vertices_range(cg)) {
+ vector<u32> init;
+ for (const auto &v : vertices_range(cg)) {
vertexMap[cg[v].stateId] = v;
+ init.push_back(cg[v].stateId);
}
+ gStack.push(init);
- // get the vertex to start from
- ue2::unordered_set<u32> foundVertexId;
+ // Get the vertex to start from
CliqueGraph::vertex_iterator vi, ve;
tie(vi, ve) = vertices(cg);
- CliqueVertex start = *vi;
- u32 startId = cg[start].stateId;
-
- bool leftChild = false;
- u32 prevId = startId;
while (!gStack.empty()) {
- CliqueGraph g = gStack.top();
+ vector<u32> g = gStack.top();
gStack.pop();
- // choose a vertex from the graph
- tie(vi, ve) = vertices(g);
- CliqueVertex cv = *vi;
- u32 id = g[cv].stateId;
-
- // corresponding vertex in the original graph
- CliqueVertex n = vertexMap.at(id);
-
- vector<CliqueVertex> neighbor;
- vector<CliqueVertex> nonneighbor;
- getNeighborInfo(g, neighbor, nonneighbor, cv);
-
- if (foundVertexId.find(id) != foundVertexId.end()) {
- prevId = id;
- // get graph consisting of non-neighbors for right branch
- if (!cg[n].rightChildVisited) {
- gStack.push(g);
- if (!nonneighbor.empty()) {
- const CliqueGraph &nSub = createSubgraph(g, nonneighbor);
- gStack.push(nSub);
- leftChild = false;
- }
- cg[n].rightChildVisited = true;
- } else if (id != startId) {
- // both the left and right branches are visited,
- // update its parent's clique and independent sets
- u32 parentId = cg[n].parentId;
- CliqueVertex parent = vertexMap.at(parentId);
- if (cg[n].leftChild) {
- updateCliqueInfo(cg, n, cg[parent].clique1,
- cg[parent].indepSet1);
- } else {
- updateCliqueInfo(cg, n, cg[parent].clique2,
- cg[parent].indepSet2);
- }
- }
- } else {
- foundVertexId.insert(id);
- g[n].leftChild = leftChild;
- g[n].parentId = prevId;
- gStack.push(g);
- // get graph consisting of neighbors for left branch
- if (!neighbor.empty()) {
- const CliqueGraph &sub = createSubgraph(g, neighbor);
- gStack.push(sub);
- leftChild = true;
- }
- prevId = id;
+ // Choose a vertex from the graph
+ u32 id = g[0];
+ const CliqueVertex &n = vertexMap.at(id);
+ clique.push_back(id);
+ // Corresponding vertex in the original graph
+ vector<u32> neighbor;
+ set<u32> subgraphId(g.begin(), g.end());
+ getNeighborInfo(cg, neighbor, n, subgraphId);
+ // Get graph consisting of neighbors for left branch
+ if (!neighbor.empty()) {
+ gStack.push(neighbor);
}
}
- updateCliqueInfo(cg, start, clique, indepSet);
}
template <typename Graph>
@@ -345,18 +204,17 @@ bool graph_empty(const Graph &g) {
static
vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1);
- vector<vector<u32>> indepSetsVec(1);
DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg));
- findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]);
+ findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back();
vector<CliqueVertex> dead;
- for (auto v : vertices_range(cg)) {
+ for (const auto &v : vertices_range(cg)) {
if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
dead.push_back(v);
}
}
- for (auto v : dead) {
+ for (const auto &v : dead) {
clear_vertex(v, cg);
remove_vertex(v, cg);
}
@@ -364,30 +222,22 @@ vector removeClique(CliqueGraph &cg) {
break;
}
vector<u32> clique;
- vector<u32> indepSet;
- findCliqueGroup(cg, clique, indepSet);
+ findCliqueGroup(cg, clique);
cliquesVec.push_back(clique);
- indepSetsVec.push_back(indepSet);
}
// get the independent set with max size
size_t max = 0;
size_t id = 0;
- for (size_t j = 0; j < indepSetsVec.size(); ++j) {
- if (indepSetsVec[j].size() > max) {
- max = indepSetsVec[j].size();
+ for (size_t j = 0; j < cliquesVec.size(); ++j) {
+ if (cliquesVec[j].size() > max) {
+ max = cliquesVec[j].size();
id = j;
}
}
- DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size());
- return indepSetsVec[id];
-}
-
-static
-vector<u32> findMaxClique(const vector<vector<u32>> &exclusiveSet) {
- auto cg = makeCG(exclusiveSet);
- return removeClique(*cg);
+ DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size());
+ return cliquesVec[id];
}
// if the location of any reset character in one literal are after
@@ -401,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2,
const auto &triggers2 = triggers[id2];
for (u32 i = 0; i < triggers1.size(); ++i) {
for (u32 j = 0; j < triggers2.size(); ++j) {
- size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]);
- size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]);
- if (max_overlap1 <= min_reset_dist[id2][j] ||
- max_overlap2 <= min_reset_dist[id1][i]) {
+ if (!literalOverlap(triggers1[i], triggers2[j],
+ min_reset_dist[id2][j]) ||
+ !literalOverlap(triggers2[j], triggers1[i],
+ min_reset_dist[id1][i])) {
return false;
}
}
@@ -420,28 +270,33 @@ vector checkExclusion(const CharReach &cr,
return group;
}
- vector<vector<size_t> > min_reset_dist;
+ vector<vector<size_t>> min_reset_dist;
// get min reset distance for each repeat
for (auto it = triggers.begin(); it != triggers.end(); it++) {
const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr);
min_reset_dist.push_back(tmp_dist);
}
- vector<vector<u32>> exclusiveSet;
+ vector<CliqueVertex> vertices;
+ unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
+ for (u32 i = 0; i < triggers.size(); ++i) {
+ CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
+ vertices.push_back(v);
+ }
+
// find exclusive pair for each repeat
for (u32 i = 0; i < triggers.size(); ++i) {
- vector<u32> repeatIds;
+ CliqueVertex s = vertices[i];
for (u32 j = i + 1; j < triggers.size(); ++j) {
if (findExclusivePair(i, j, min_reset_dist, triggers)) {
- repeatIds.push_back(j);
+ CliqueVertex d = vertices[j];
+ add_edge(s, d, *cg);
}
}
- exclusiveSet.push_back(repeatIds);
- DEBUG_PRINTF("Exclusive pair size:%lu\n", repeatIds.size());
}
// find the largest exclusive group
- return findMaxClique(exclusiveSet);
+ return removeClique(*cg);
}
static
@@ -599,7 +454,7 @@ buildCastle(const CastleProto &proto,
repeatInfoPair.push_back(make_pair(min_period, is_reset));
- if (is_reset) {
+ if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) {
candidateTriggers.push_back(triggers.at(top));
candidateRepeats.push_back(i);
}
@@ -608,7 +463,7 @@ buildCastle(const CastleProto &proto,
// Case 1: exclusive repeats
bool exclusive = false;
bool pureExclusive = false;
- u8 activeIdxSize = 0;
+ u32 activeIdxSize = 0;
set<u32> exclusiveGroup;
if (cc.grey.castleExclusive) {
vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers);
@@ -617,7 +472,7 @@ buildCastle(const CastleProto &proto,
// Case 1: mutual exclusive repeats group found, initialize state
// sizes
exclusive = true;
- activeIdxSize = calcPackedBytes(exclusiveSize);
+ activeIdxSize = calcPackedBytes(numRepeats + 1);
if (exclusiveSize == numRepeats) {
pureExclusive = true;
streamStateSize = 0;
@@ -665,7 +520,7 @@ buildCastle(const CastleProto &proto,
c->numRepeats = verify_u32(subs.size());
c->exclusive = exclusive;
c->pureExclusive = pureExclusive;
- c->activeIdxSize = activeIdxSize;
+ c->activeIdxSize = verify_u8(activeIdxSize);
writeCastleScanEngine(cr, c);
@@ -710,8 +565,8 @@ buildCastle(const CastleProto &proto,
set<ReportID> all_reports(const CastleProto &proto) {
set<ReportID> reports;
- for (const PureRepeat &pr : proto.repeats | map_values) {
- reports.insert(pr.reports.begin(), pr.reports.end());
+ for (const ReportID &report : proto.report_map | map_keys) {
+ reports.insert(report);
}
return reports;
}
@@ -732,10 +587,30 @@ depth findMaxWidth(const CastleProto &proto) {
return max_width;
}
+depth findMinWidth(const CastleProto &proto, u32 top) {
+ if (!contains(proto.repeats, top)) {
+ assert(0); // should not happen
+ return depth::infinity();
+ }
+ return proto.repeats.at(top).bounds.min;
+}
+
+depth findMaxWidth(const CastleProto &proto, u32 top) {
+ if (!contains(proto.repeats, top)) {
+ assert(0); // should not happen
+ return depth(0);
+ }
+ return proto.repeats.at(top).bounds.max;
+}
+
CastleProto::CastleProto(const PureRepeat &pr) {
assert(pr.reach.any());
assert(pr.reports.size() == 1);
- repeats.insert(make_pair(0, pr));
+ u32 top = 0;
+ repeats.emplace(top, pr);
+ for (const auto &report : pr.reports) {
+ report_map[report].insert(top);
+ }
}
const CharReach &CastleProto::reach() const {
@@ -743,25 +618,29 @@ const CharReach &CastleProto::reach() const {
return repeats.begin()->second.reach;
}
-static
-u32 find_next_top(const map<u32, PureRepeat> &repeats) {
- u32 top = 0;
- for (; contains(repeats, top); top++) {
- // pass
- }
- return top;
-}
-
u32 CastleProto::add(const PureRepeat &pr) {
assert(repeats.size() < max_occupancy);
assert(pr.reach == reach());
assert(pr.reports.size() == 1);
- u32 top = find_next_top(repeats);
+ u32 top = next_top++;
DEBUG_PRINTF("selected unused top %u\n", top);
- repeats.insert(make_pair(top, pr));
+ assert(!contains(repeats, top));
+ repeats.emplace(top, pr);
+ for (const auto &report : pr.reports) {
+ report_map[report].insert(top);
+ }
return top;
}
+void CastleProto::erase(u32 top) {
+ DEBUG_PRINTF("erase top %u\n", top);
+ assert(contains(repeats, top));
+ repeats.erase(top);
+ for (auto &m : report_map) {
+ m.second.erase(top);
+ }
+}
+
u32 CastleProto::merge(const PureRepeat &pr) {
assert(repeats.size() <= max_occupancy);
assert(pr.reach == reach());
@@ -806,8 +685,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2,
const u32 top = m.first;
const PureRepeat &pr = m.second;
DEBUG_PRINTF("top %u\n", top);
- u32 new_top = find_next_top(c1.repeats);
- c1.repeats.insert(make_pair(new_top, pr));
+ u32 new_top = c1.add(pr);
top_map[top] = new_top;
DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top);
}
@@ -823,12 +701,23 @@ void remapCastleTops(CastleProto &proto, map &top_map) {
for (const auto &m : proto.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
- u32 new_top = find_next_top(out);
- out.insert(make_pair(new_top, pr));
+ u32 new_top = out.size();
+ out.emplace(new_top, pr);
top_map[top] = new_top;
}
proto.repeats.swap(out);
+
+ // Remap report map.
+ proto.report_map.clear();
+ for (const auto &m : proto.repeats) {
+ const u32 top = m.first;
+ const PureRepeat &pr = m.second;
+ for (const auto &report : pr.reports) {
+ proto.report_map[report].insert(top);
+ }
+ }
+
assert(proto.repeats.size() <= proto.max_occupancy);
}
@@ -904,18 +793,17 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) {
return c1.repeats == c2.repeats;
}
-bool requiresDedupe(const CastleProto &proto, const set<ReportID> &reports) {
- ue2::unordered_set<ReportID> seen;
- for (const PureRepeat &pr : proto.repeats | map_values) {
- for (const ReportID &report : pr.reports) {
- if (contains(reports, report)) {
- if (contains(seen, report)) {
- DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
- report);
- return true;
- }
- seen.insert(report);
- }
+bool requiresDedupe(const CastleProto &proto,
+ const ue2::flat_set<ReportID> &reports) {
+ for (const auto &report : reports) {
+ auto it = proto.report_map.find(report);
+ if (it == end(proto.report_map)) {
+ continue;
+ }
+ if (it->second.size() > 1) {
+ DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
+ report);
+ return true;
}
}
return false;
diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h
index fbafb606..fc4bb991 100644
--- a/src/nfa/castlecompile.h
+++ b/src/nfa/castlecompile.h
@@ -38,6 +38,7 @@
#include "nfagraph/ng_repeat.h"
#include "util/alloc.h"
#include "util/depth.h"
+#include "util/ue2_containers.h"
#include