From a6383a54a42847fe5c23682898ebf026c40c421f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 30 Oct 2015 09:33:04 +1100 Subject: [PATCH 01/54] assignStringsToBuckets: assert that there are lits --- src/fdr/fdr_compile.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 8d658ccd..8f1f3e03 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -245,6 +245,8 @@ void FDRCompiler::assignStringsToBuckets() { typedef pair SCORE_INDEX_PAIR; u32 ls = verify_u32(lits.size()); + assert(ls); // Shouldn't be called with no literals. + // make a vector that contains our literals as pointers or u32 LiteralIndex values vector vli; vli.resize(ls); @@ -292,6 +294,8 @@ void FDRCompiler::assignStringsToBuckets() { currentChunk++; } } + + assert(currentChunk > 0); count[currentChunk - 1] = ls - chunkStartID; // close off chunks with an empty row firstIds[currentChunk] = ls; From cea914e18e32a4c35421b94dc8cc133e5b65d57f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 30 Oct 2015 09:43:28 +1100 Subject: [PATCH 02/54] Add q_last_type() queue function Analogous to q_cur_type(), asserts that queue indices are within a valid range. --- src/nfa/nfa_api_queue.h | 8 ++++++++ src/rose/catchup.c | 2 +- src/rose/match.c | 4 ++-- src/rose/stream.c | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index d8079292..75c15cce 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -196,6 +196,14 @@ static really_inline s64a q_cur_loc(const struct mq *q) { return q->items[q->cur].location; } +/** \brief Returns the type of the last event in the queue. 
*/ +static really_inline u32 q_last_type(const struct mq *q) { + assert(q->cur < q->end); + assert(q->end > 0); + assert(q->end <= MAX_MQE_LEN); + return q->items[q->end - 1].type; +} + /** \brief Returns the location (relative to the beginning of the current data * buffer) of the last event in the queue. */ static really_inline s64a q_last_loc(const struct mq *q) { diff --git a/src/rose/catchup.c b/src/rose/catchup.c index b9fcd784..77b12b49 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -379,7 +379,7 @@ void ensureEnd(struct mq *q, UNUSED u32 qi, s64a final_loc) { DEBUG_PRINTF("ensure MQE_END %lld for queue %u\n", final_loc, qi); if (final_loc >= q_last_loc(q)) { /* TODO: ensure situation does not arise */ - assert(q->items[q->end - 1].type != MQE_END); + assert(q_last_type(q) != MQE_END); pushQueueNoMerge(q, MQE_END, final_loc); } } diff --git a/src/rose/match.c b/src/rose/match.c index 32224c24..be9bc35e 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -758,7 +758,7 @@ found_miracle: q_skip_forward_to(q, miracle_loc); - if (q->items[q->end - 1].type == MQE_START) { + if (q_last_type(q) == MQE_START) { DEBUG_PRINTF("miracle caused infix to die\n"); return 0; } @@ -853,7 +853,7 @@ char roseTestLeftfix(const struct RoseEngine *t, const struct RoseRole *tr, } } - if (q_cur_loc(q) < loc || q->items[q->end - 1].type != MQE_START) { + if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { if (left->infix) { if (infixTooOld(q, loc)) { DEBUG_PRINTF("infix %u died of old age\n", ri); diff --git a/src/rose/stream.c b/src/rose/stream.c index aab79a29..b100eeae 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -167,7 +167,7 @@ found_miracle: DEBUG_PRINTF("skip q forward, %lld to %lld\n", begin_loc, miracle_loc); q_skip_forward_to(q, miracle_loc); - if (q->items[q->end - 1].type == MQE_START) { + if (q_last_type(q) == MQE_START) { DEBUG_PRINTF("miracle caused infix to die\n"); return MIRACLE_DEAD; } From da2386585da4f7e72e359a1334ce4b78c8892caa 
Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 30 Oct 2015 09:52:49 +1100 Subject: [PATCH 03/54] Init filter members to nullptr Note that BGL filters must be default-constructible. --- src/nfagraph/ng_depth.cpp | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/nfagraph/ng_depth.cpp b/src/nfagraph/ng_depth.cpp index 719c7654..d7945be9 100644 --- a/src/nfagraph/ng_depth.cpp +++ b/src/nfagraph/ng_depth.cpp @@ -55,14 +55,14 @@ namespace ue2 { namespace { /** Distance value used to indicate that the vertex can't be reached. */ -static const int DIST_UNREACHABLE = INT_MAX; +static constexpr int DIST_UNREACHABLE = INT_MAX; /** * Distance value used to indicate that the distance to a vertex is infinite * (for example, it's the max distance and there's a cycle in the path) or so * large that we should consider it effectively infinite. */ -static const int DIST_INFINITY = INT_MAX - 1; +static constexpr int DIST_INFINITY = INT_MAX - 1; // // Filters @@ -71,10 +71,12 @@ static const int DIST_INFINITY = INT_MAX - 1; template struct NodeFilter { typedef typename GraphT::edge_descriptor EdgeT; - NodeFilter() { } + NodeFilter() {} // BGL filters must be default-constructible. NodeFilter(const vector *bad_in, const GraphT *g_in) : bad(bad_in), g(g_in) { } bool operator()(const EdgeT &e) const { + assert(g && bad); + u32 src_idx = (*g)[source(e, *g)].index; u32 tar_idx = (*g)[target(e, *g)].index; @@ -84,16 +86,20 @@ struct NodeFilter { return !(*bad)[src_idx] && !(*bad)[tar_idx]; } - const vector *bad; - const GraphT *g; + +private: + const vector *bad = nullptr; + const GraphT *g = nullptr; }; template struct StartFilter { typedef typename GraphT::edge_descriptor EdgeT; - StartFilter() { } + StartFilter() {} // BGL filters must be default-constructible. 
explicit StartFilter(const GraphT *g_in) : g(g_in) { } bool operator()(const EdgeT &e) const { + assert(g); + u32 src_idx = (*g)[source(e, *g)].index; u32 tar_idx = (*g)[target(e, *g)].index; @@ -107,7 +113,9 @@ struct StartFilter { } return true; } - const GraphT *g; + +private: + const GraphT *g = nullptr; }; } // namespace From 447753f1481ad15c06ee593a04f23d9c2a8b8ba9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 30 Oct 2015 15:01:20 +1100 Subject: [PATCH 04/54] FDR compiler: assert that all models are < 32 bits --- src/fdr/fdr_compile.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 8f1f3e03..8be44370 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -387,12 +387,14 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng, const vector &lits, SuffixPositionInString pos, std::map > &m2) { + assert(eng.bits < 32); + u32 distance = 0; if (eng.bits <= 8) { distance = 1; } else if (eng.bits <= 16) { distance = 2; - } else if (eng.bits <= 32) { + } else { distance = 4; } From c7bebf8836e539c5f21f001a0d3f417c2975a290 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 30 Oct 2015 15:10:03 +1100 Subject: [PATCH 05/54] RoseBuildImpl: init base_id members These are set late in the Rose build process, when final IDs are allocated. 
--- src/rose/rose_build_misc.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 9f309985..f627017c 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -77,6 +77,8 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, hasSom(false), group_weak_end(0), group_end(0), + anchored_base_id(MO_INVALID_IDX), + nonbenefits_base_id(MO_INVALID_IDX), ematcher_region_size(0), floating_direct_report(false), eod_event_literal_id(MO_INVALID_IDX), From 7b6ad2a01a6966f94332854c877d35511d421a79 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 30 Oct 2015 16:20:18 +1100 Subject: [PATCH 06/54] doComponent: make it obvious that a is never null --- src/nfagraph/ng_puff.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp index cd190171..501d8f7b 100644 --- a/src/nfagraph/ng_puff.cpp +++ b/src/nfagraph/ng_puff.cpp @@ -316,7 +316,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a, bool unbounded = false; bool exhaustible = can_exhaust(g, rm); - while (a) { + while (true) { if (is_special(a, g)) { DEBUG_PRINTF("stopped puffing due to special vertex\n"); break; @@ -350,9 +350,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a, a = getSoleSourceVertex(g, a); - if (!a) { - break; - } + assert(a); /* already checked that old a had a proper in degree of 1 */ // Snark: we can't handle this case, because we can only handle a // single report ID on a vertex From a255e6b67802a05a33acfde43f71a9b1824f8b81 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 2 Nov 2015 10:28:31 +1100 Subject: [PATCH 07/54] add asserts to make bounds on alphaShift clear --- src/nfa/mcclellancompile.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index 39d29359..f75d08b5 100644 --- 
a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -700,7 +700,10 @@ aligned_unique_ptr mcclellanCompile16(dfa_info &info, ReportID arb; u8 single; u32 accelCount; + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + u16 count_real_states; if (allocateFSN16(info, &count_real_states)) { DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", @@ -843,6 +846,7 @@ void fillInBasicState8(const dfa_info &info, mstate_aux *aux, u8 *succ_table, const vector &reports_eod, u32 i) { dstate_id_t j = info.implId(i); u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); for (size_t s = 0; s < info.impl_alpha_size; s++) { dstate_id_t raw_succ = info.states[i].next[s]; From 510e999738c2ae755b01e7300c19cac09f689bcc Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 2 Nov 2015 11:22:51 +1100 Subject: [PATCH 08/54] make Automaton_Base ctor protected Makes explicit that Automaton_Base is intended to be used as a only base class --- src/nfagraph/ng_haig.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index 7681fa14..78c615c9 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -114,7 +114,7 @@ void populateAccepts(const NGHolder &g, StateSet *accept, StateSet *acceptEod) { } class Automaton_Base { -public: +protected: Automaton_Base(const NGHolder &graph_in, const ue2::unordered_map &state_ids_in) : graph(graph_in), state_ids(state_ids_in) { @@ -122,6 +122,7 @@ public: assert(alphasize <= ALPHABET_SIZE); } +public: static bool canPrune(const flat_set &) { return false; } const NGHolder &graph; From 01498fa8a577625614b2fa57dbb792849706e792 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 2 Nov 2015 10:20:37 +1100 Subject: [PATCH 09/54] LimEx NFA: no need to zero init cached_esucc All of the "exception cache" members are guarded by cached_esucc. 
--- src/nfa/limex_runtime_impl.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 611fe747..703a7d19 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -542,7 +542,6 @@ char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { ctx->callback = q->cb; ctx->context = q->context; STORE_STATE(&ctx->cached_estate, ZERO_STATE); - STORE_STATE(&ctx->cached_esucc, ZERO_STATE); assert(q->items[q->cur].location >= 0); DEBUG_PRINTF("LOAD STATE\n"); @@ -638,7 +637,6 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { ctx->callback = q->cb; ctx->context = q->context; STORE_STATE(&ctx->cached_estate, ZERO_STATE); - STORE_STATE(&ctx->cached_esucc, ZERO_STATE); DEBUG_PRINTF("LOAD STATE\n"); STORE_STATE(&ctx->s, LOAD_STATE(q->state)); @@ -730,7 +728,6 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, ctx->callback = NULL; ctx->context = NULL; STORE_STATE(&ctx->cached_estate, ZERO_STATE); - STORE_STATE(&ctx->cached_esucc, ZERO_STATE); DEBUG_PRINTF("LOAD STATE\n"); STORE_STATE(&ctx->s, LOAD_STATE(q->state)); @@ -833,7 +830,6 @@ char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, ctx->callback = cb; ctx->context = context; STORE_STATE(&ctx->cached_estate, ZERO_STATE); - STORE_STATE(&ctx->cached_esucc, ZERO_STATE); const IMPL_NFA_T *limex = getImplNfa(n); STORE_STATE(&ctx->s, INITIAL_FN(limex, 0)); // always anchored From b5e290e98508c58d699bbc957e44ef2b0bfe315d Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 2 Nov 2015 11:57:55 +1100 Subject: [PATCH 10/54] LimEx NFA: no need to zero estate cache in STREAM We believe that we have solved the issues that required zeroing of the exception state in STREAM_FN and REV_STREAM_FN nowadays. 
--- src/nfa/limex_exceptional.h | 3 ++- src/nfa/limex_native.c | 3 ++- src/nfa/limex_runtime_impl.h | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index dcc302d4..26c5e5a5 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -218,7 +218,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) { DEBUG_PRINTF("using cached succ from previous state\n"); STORE_STATE(succ, OR_STATE(LOAD_STATE(succ), LOAD_STATE(&ctx->cached_esucc))); - if (ctx->cached_reports) { + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, ctx->context, offset) == MO_HALT_MATCHING)) { diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index 675c18bd..49c10199 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -83,7 +83,8 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, if (estate == ctx->cached_estate) { DEBUG_PRINTF("using cached succ from previous state\n"); *succ |= ctx->cached_esucc; - if (ctx->cached_reports) { + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, ctx->context, offset) == MO_HALT_MATCHING)) { diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 703a7d19..6ef3bae9 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -179,7 +179,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, assert(ISALIGNED_CL(ctx)); assert(ISALIGNED_CL(&ctx->s)); STATE_T s = LOAD_STATE(&ctx->s); - STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */ /* assert(ISALIGNED_16(exceptions)); */ /* 
assert(ISALIGNED_16(reach)); */ @@ -305,7 +304,6 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, const ReportID *exReports = getExReports(limex); const u32 *exceptionMap = limex->exceptionMap; STATE_T s = LOAD_STATE(&ctx->s); - STORE_STATE(&ctx->cached_estate, ZERO_STATE); /* TODO: understand why this is required */ /* assert(ISALIGNED_16(exceptions)); */ /* assert(ISALIGNED_16(reach)); */ From 4311775b436430a9f5a724eee2a78d2fc8101e57 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 2 Nov 2015 12:00:09 +1100 Subject: [PATCH 11/54] LimEx NFA: unify flush br/estate behaviour Make the GPR NFA models only clear cached_estate conditionally based on cached_br, as per the SIMD models. --- src/nfa/limex_native.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index 49c10199..471e4bf0 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -120,7 +120,9 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { - ctx->cached_estate = 0U; + if (ctx->cached_br) { + ctx->cached_estate = 0U; + } } return 0; From 89660e30b6fad4dd67cf07478e3498eae6f2100d Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 2 Nov 2015 13:35:04 +1100 Subject: [PATCH 12/54] raw_som_dfa: initialize members in constructor --- src/nfa/goughcompile.cpp | 2 +- src/nfa/goughcompile.h | 7 +++++-- src/nfagraph/ng_haig.cpp | 11 +++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index 92d09aa5..d2de7b95 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -1109,7 +1109,7 @@ aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, u32 total_prog_size = byte_length(temp_blocks); curr_offset += total_prog_size; - 
gi.stream_som_loc_count = slot_count; + gi.stream_som_loc_count = slot_count; gi.stream_som_loc_width = somPrecision; u32 gough_size = ROUNDUP_N(curr_offset, 16); diff --git a/src/nfa/goughcompile.h b/src/nfa/goughcompile.h index b575c75b..9da983d4 100644 --- a/src/nfa/goughcompile.h +++ b/src/nfa/goughcompile.h @@ -70,8 +70,11 @@ struct dstate_som { }; struct raw_som_dfa : public raw_dfa { - raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in) - : raw_dfa(k), unordered_som_triggers(unordered_som_triggers_in) { + raw_som_dfa(nfa_kind k, bool unordered_som_triggers_in, u32 trigger, + u32 stream_som_loc_width_in) + : raw_dfa(k), stream_som_loc_width(stream_som_loc_width_in), + unordered_som_triggers(unordered_som_triggers_in), + trigger_nfa_state(trigger) { assert(!unordered_som_triggers || is_triggered(kind)); } diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index 78c615c9..d06083bd 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -609,7 +609,6 @@ bool doHaig(const NGHolder &g, } haig_note_starts(g, &rdfa->new_som_nfa_states); - rdfa->trigger_nfa_state = NODE_START; return true; } @@ -639,7 +638,8 @@ unique_ptr attemptToBuildHaig(NGHolder &g, som_type som, return nullptr; } - auto rdfa = ue2::make_unique(g.kind, unordered_som); + auto rdfa = ue2::make_unique(g.kind, unordered_som, NODE_START, + somPrecision); DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates); bool rv; @@ -659,7 +659,6 @@ unique_ptr attemptToBuildHaig(NGHolder &g, som_type som, DEBUG_PRINTF("determinised, building impl dfa (a,f) = (%hu,%hu)\n", rdfa->start_anchored, rdfa->start_floating); - rdfa->stream_som_loc_width = somPrecision; assert(rdfa->kind == g.kind); return rdfa; @@ -783,7 +782,9 @@ unique_ptr attemptToMergeHaig(const vector &df typedef Automaton_Haig_Merge::StateSet StateSet; vector nfa_state_map; - auto rdfa = ue2::make_unique(dfas[0]->kind, unordered_som); + auto rdfa = ue2::make_unique(dfas[0]->kind, unordered_som, + 
NODE_START, + dfas[0]->stream_som_loc_width); int rv = determinise(n, rdfa->states, limit, &nfa_state_map); if (rv) { @@ -831,11 +832,9 @@ unique_ptr attemptToMergeHaig(const vector &df } haig_merge_note_starts(dfas, per_dfa_adj, &rdfa->new_som_nfa_states); - rdfa->trigger_nfa_state = NODE_START; DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n", rdfa->start_anchored, rdfa->start_floating); - rdfa->stream_som_loc_width = dfas[0]->stream_som_loc_width; return rdfa; } From 1507b3fd36872f666af077dadd6fd79c235ebdfd Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 2 Nov 2015 14:36:43 +1100 Subject: [PATCH 13/54] move oversize graph check out of Automaton_holder ctor --- src/rose/rose_build_anchored.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 0c556dfe..675f8c68 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -271,16 +271,13 @@ public: typedef Holder_StateSet StateSet; typedef ue2::unordered_map StateMap; - explicit Automaton_Holder(const NGHolder &g_in) : g(g_in), bad(false) { + explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) { for (auto v : vertices_range(g)) { vertexToIndex[v] = indexToVertex.size(); indexToVertex.push_back(v); } - if (indexToVertex.size() > ANCHORED_NFA_STATE_LIMIT) { - bad = true; - return; - } + assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT); DEBUG_PRINTF("%zu states\n", indexToVertex.size()); init.wdelay = 0; @@ -400,7 +397,6 @@ public: array alpha; array unalpha; u16 alphasize; - bool bad; }; } // namespace @@ -670,13 +666,13 @@ int finalise_out(RoseBuildImpl &tbi, const NGHolder &h, static int addAutomaton(RoseBuildImpl &tbi, const NGHolder &h, ReportID *remap) { - Automaton_Holder autom(h); - - if (autom.bad) { + if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) { DEBUG_PRINTF("autom bad!\n"); return ANCHORED_FAIL; } + Automaton_Holder autom(h); + unique_ptr 
out_dfa = ue2::make_unique(NFA_OUTFIX); if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) { return finalise_out(tbi, h, autom, move(out_dfa), remap); @@ -738,7 +734,6 @@ void buildSimpleDfas(const RoseBuildImpl &tbi, NGHolder h; populate_holder(simple.first, exit_ids, &h); Automaton_Holder autom(h); - assert(!autom.bad); unique_ptr rdfa = ue2::make_unique(NFA_OUTFIX); UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES); assert(!rv); From e8bfe5478b9073a135e42d2c748db83cd88b27cf Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Thu, 29 Oct 2015 06:43:47 -0400 Subject: [PATCH 14/54] Optimize max clique analysis Use vectors of state ids to avoid the overhead of subgraph copies --- src/nfa/castlecompile.cpp | 164 ++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 94 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 5ea0e873..e7e40540 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -145,7 +145,6 @@ struct CliqueVertexProps { u32 stateId = ~0U; u32 parentId = ~0U; bool leftChild = false; /* tells us if it is the left child of its parent */ - bool rightChildVisited = false; /* tells us if its right child is visited */ vector clique1; /* clique for the left branch */ vector indepSet1; /* independent set for the left branch */ @@ -188,56 +187,6 @@ unique_ptr makeCG(const vector> &exclusiveSet) { return cg; } -static -CliqueGraph createSubgraph(const CliqueGraph &cg, - const vector &vertices) { - CliqueGraph g; - map vertexMap; - for (auto u : vertices) { - u32 id = cg[u].stateId; - CliqueVertex v = add_vertex(CliqueVertexProps(id), g); - vertexMap[id] = v; - } - - set found; - for (auto u : vertices) { - u32 srcId = cg[u].stateId; - CliqueVertex src = vertexMap[srcId]; - found.insert(srcId); - for (auto n : adjacent_vertices_range(u, cg)) { - u32 dstId = cg[n].stateId; - if (found.find(dstId) == found.end() && - vertexMap.find(dstId) != vertexMap.end()) { - CliqueVertex 
dst = vertexMap[dstId]; - add_edge(src, dst, g); - } - } - } - return g; -} - -static -void getNeighborInfo(const CliqueGraph &g, vector &neighbor, - vector &nonneighbor, - const CliqueVertex &cv) { - u32 id = g[cv].stateId; - ue2::unordered_set neighborId; - - // find neighbors for cv - for (auto v : adjacent_vertices_range(cv, g)) { - neighbor.push_back(v); - neighborId.insert(g[v].stateId); - } - - // find non-neighbors for cv - for (auto v : vertices_range(g)) { - if (g[v].stateId != id && - neighborId.find(g[v].stateId) == neighborId.end()) { - nonneighbor.push_back(v); - } - } -} - static void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n, vector &clique, vector &indepSet) { @@ -257,76 +206,103 @@ void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n, } } +static +void getNeighborInfo(const CliqueGraph &g, vector &neighbor, + vector &nonneighbor, const CliqueVertex &cv, + const set &group) { + u32 id = g[cv].stateId; + ue2::unordered_set neighborId; + + // find neighbors for cv + for (const auto &v : adjacent_vertices_range(cv, g)) { + if (g[v].stateId != id && contains(group, g[v].stateId)) { + neighbor.push_back(g[v].stateId); + neighborId.insert(g[v].stateId); + } + } + + neighborId.insert(id); + // find non-neighbors for cv + for (const auto &v : vertices_range(g)) { + if (!contains(neighborId, g[v].stateId) && + contains(group, g[v].stateId)) { + nonneighbor.push_back(g[v].stateId); + } + } +} + static void findCliqueGroup(CliqueGraph &cg, vector &clique, vector &indepSet) { - stack gStack; - gStack.push(cg); + stack> gStack; // create mapping between vertex and id map vertexMap; - for (auto v : vertices_range(cg)) { + vector init; + for (auto &v : vertices_range(cg)) { vertexMap[cg[v].stateId] = v; + init.push_back(cg[v].stateId); } + gStack.push(init); // get the vertex to start from - ue2::unordered_set foundVertexId; + set foundVertexId; + ue2::unordered_set visitedId; CliqueGraph::vertex_iterator vi, ve; tie(vi, ve) = vertices(cg); 
CliqueVertex start = *vi; u32 startId = cg[start].stateId; - + DEBUG_PRINTF("startId:%u\n", startId); bool leftChild = false; u32 prevId = startId; while (!gStack.empty()) { - CliqueGraph g = gStack.top(); - gStack.pop(); + const auto &g = gStack.top(); // choose a vertex from the graph - tie(vi, ve) = vertices(g); - CliqueVertex cv = *vi; - u32 id = g[cv].stateId; + assert(!g.empty()); + u32 id = g[0]; + CliqueVertex &n = vertexMap.at(id); - // corresponding vertex in the original graph - CliqueVertex n = vertexMap.at(id); - - vector neighbor; - vector nonneighbor; - getNeighborInfo(g, neighbor, nonneighbor, cv); - - if (foundVertexId.find(id) != foundVertexId.end()) { + vector neighbor; + vector nonneighbor; + set subgraphId(g.begin(), g.end()); + getNeighborInfo(cg, neighbor, nonneighbor, n, subgraphId); + if (contains(foundVertexId, id)) { prevId = id; - // get graph consisting of non-neighbors for right branch - if (!cg[n].rightChildVisited) { - gStack.push(g); + // get non-neighbors for right branch + if (visitedId.insert(id).second) { + DEBUG_PRINTF("right branch\n"); if (!nonneighbor.empty()) { - const CliqueGraph &nSub = createSubgraph(g, nonneighbor); - gStack.push(nSub); + gStack.push(nonneighbor); leftChild = false; } - cg[n].rightChildVisited = true; - } else if (id != startId) { - // both the left and right branches are visited, - // update its parent's clique and independent sets - u32 parentId = cg[n].parentId; - CliqueVertex parent = vertexMap.at(parentId); - if (cg[n].leftChild) { - updateCliqueInfo(cg, n, cg[parent].clique1, - cg[parent].indepSet1); - } else { - updateCliqueInfo(cg, n, cg[parent].clique2, - cg[parent].indepSet2); + } else { + if (id != startId) { + // both the left and right branches are visited, + // update its parent's clique and independent sets + u32 parentId = cg[n].parentId; + CliqueVertex &parent = vertexMap.at(parentId); + if (cg[n].leftChild) { + updateCliqueInfo(cg, n, cg[parent].clique1, + cg[parent].indepSet1); + } 
else { + updateCliqueInfo(cg, n, cg[parent].clique2, + cg[parent].indepSet2); + } } + gStack.pop(); } } else { foundVertexId.insert(id); - g[n].leftChild = leftChild; - g[n].parentId = prevId; - gStack.push(g); - // get graph consisting of neighbors for left branch + cg[n].leftChild = leftChild; + cg[n].parentId = prevId; + cg[n].clique1.clear(); + cg[n].clique2.clear(); + cg[n].indepSet1.clear(); + cg[n].indepSet2.clear(); + // get neighbors for left branch if (!neighbor.empty()) { - const CliqueGraph &sub = createSubgraph(g, neighbor); - gStack.push(sub); + gStack.push(neighbor); leftChild = true; } prevId = id; @@ -351,12 +327,12 @@ vector removeClique(CliqueGraph &cg) { while (!graph_empty(cg)) { const vector &c = cliquesVec.back(); vector dead; - for (auto v : vertices_range(cg)) { + for (const auto &v : vertices_range(cg)) { if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) { dead.push_back(v); } } - for (auto v : dead) { + for (const auto &v : dead) { clear_vertex(v, cg); remove_vertex(v, cg); } From ae7dbc2472cc1d826dcbcf5cdb5a56a7187e619f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 2 Nov 2015 14:41:17 +1100 Subject: [PATCH 15/54] repeatRecurTable: no need for u64a return type --- src/nfa/repeatcompile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nfa/repeatcompile.cpp b/src/nfa/repeatcompile.cpp index 25e57581..2f187503 100644 --- a/src/nfa/repeatcompile.cpp +++ b/src/nfa/repeatcompile.cpp @@ -75,8 +75,8 @@ u32 calcPackedBytes(u64a val) { } static -u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax, - const u32 minPeriod) { +u32 repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax, + const u32 minPeriod) { u32 repeatTmp = info->patchCount > 2 ? 64 : (u32)repeatMax; u32 repeat_index = repeatTmp < minPeriod ? 
repeatTmp : minPeriod; for (u32 i = 0; i <= repeat_index; i++) { @@ -93,7 +93,7 @@ u64a repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax, static u32 findOptimalPatchSize(struct RepeatStateInfo *info, const depth &repeatMax, - const u32 minPeriod, u64a rv) { + const u32 minPeriod, u32 rv) { u32 cnt = 0; u32 patch_bits = 0; u32 total_size = 0; @@ -171,7 +171,7 @@ RepeatStateInfo::RepeatStateInfo(enum RepeatType type, const depth &repeatMin, assert(minPeriod); assert(repeatMax.is_finite()); { - u64a rv = repeatRecurTable(this, repeatMax, minPeriod); + u32 rv = repeatRecurTable(this, repeatMax, minPeriod); u32 repeatTmp = 0; if ((u32)repeatMax < minPeriod) { repeatTmp = repeatMax; From a083bcfa8dd24792ba436a2818210c71ab3264ee Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 13:18:34 +1100 Subject: [PATCH 16/54] repeat: use u32 arithmetic explicitly In some ring-based models, we know that if the ring is not stale, then all our bounds should fit within 32-bits. This change makes these explicitly u32 rather than implicitly narrowing later on. --- src/nfa/repeat.c | 47 ++++++++++++++++++++++++---------------- unit/internal/repeat.cpp | 6 +++-- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/nfa/repeat.c b/src/nfa/repeat.c index f84eed71..0344423b 100644 --- a/src/nfa/repeat.c +++ b/src/nfa/repeat.c @@ -39,6 +39,8 @@ #include "util/pack_bits.h" #include "util/partial_store.h" #include "util/unaligned.h" + +#include #include /** \brief Returns the total capacity of the ring. @@ -709,12 +711,7 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, dumpRing(info, xs, ring); #endif - // We work in terms of the distance between the current offset and the base - // offset in our history. 
- u64a delta = offset - xs->offset; - DEBUG_PRINTF("delta=%llu\n", delta); - - if (delta < info->repeatMin) { + if (offset - xs->offset < info->repeatMin) { DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); return REPEAT_NOMATCH; } @@ -724,17 +721,22 @@ enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, return REPEAT_STALE; } + // If we're not stale, delta fits in the range [repeatMin, lastTop + + // repeatMax], which fits in a u32. + assert(offset - xs->offset < UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + DEBUG_PRINTF("delta=%u\n", delta); + // Find the bounds on possible matches in the ring buffer. - u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0; - u64a upper = delta - info->repeatMin + 1; - upper = MIN(upper, ringOccupancy(xs, ringSize)); + u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin + 1, ringOccupancy(xs, ringSize)); if (lower >= upper) { DEBUG_PRINTF("no matches to check\n"); return REPEAT_NOMATCH; } - DEBUG_PRINTF("possible match indices=[%llu,%llu]\n", lower, upper); + DEBUG_PRINTF("possible match indices=[%u,%u]\n", lower, upper); if (ringHasMatch(xs, ring, ringSize, lower, upper)) { return REPEAT_MATCH; } @@ -1252,17 +1254,19 @@ u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, } else if (nextOffset > repeatLastTopSparseOptimalP(info, ctrl, state) + info->repeatMax) { + DEBUG_PRINTF("ring is stale\n"); return 0; } else { - u64a delta = nextOffset - xs->offset; - u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + assert(nextOffset - xs->offset < UINT32_MAX); // ring is not stale + u32 delta = (u32)(nextOffset - xs->offset); + u32 lower = delta > info->repeatMax ? 
delta - info->repeatMax : 0; patch = lower / patch_size; tval = lower - patch * patch_size; } DEBUG_PRINTF("patch %u\n", patch); u32 patch_count = info->patchCount; - if (patch >= patch_count){ + if (patch >= patch_count) { return 0; } @@ -1480,7 +1484,7 @@ char sparseHasMatch(const struct RepeatInfo *info, const u8 *state, enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, const union RepeatControl *ctrl, const void *state, u64a offset) { - DEBUG_PRINTF("check for match at %llu corresponding to trigger" + DEBUG_PRINTF("check for match at %llu corresponding to trigger " "at [%llu, %llu]\n", offset, offset - info->repeatMax, offset - info->repeatMin); @@ -1498,15 +1502,20 @@ enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, return REPEAT_STALE; } - u64a delta = offset - xs->offset; - u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0; - u64a upper = delta - info->repeatMin; + // Our delta between the base offset of the ring and the current offset + // must fit within the range [repeatMin, lastPossibleTop + repeatMax]. This + // range fits comfortably within a u32. + assert(offset - xs->offset <= UINT32_MAX); + + u32 delta = (u32)(offset - xs->offset); u32 patch_size = info->patchSize; u32 patch_count = info->patchCount; u32 occ = ringOccupancy(xs, patch_count); - upper = MIN(upper, occ * patch_size - 1); - DEBUG_PRINTF("lower=%llu, upper=%llu\n", lower, upper); + u32 lower = delta > info->repeatMax ? 
delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin, occ * patch_size - 1); + + DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper); u32 patch_lower = lower / patch_size; u32 patch_upper = upper / patch_size; diff --git a/unit/internal/repeat.cpp b/unit/internal/repeat.cpp index 95408f28..db797d24 100644 --- a/unit/internal/repeat.cpp +++ b/unit/internal/repeat.cpp @@ -505,6 +505,7 @@ const RepeatTestInfo sparseRepeats[] = { { REPEAT_SPARSE_OPTIMAL_P, 4000, 4000 }, { REPEAT_SPARSE_OPTIMAL_P, 4500, 4500 }, { REPEAT_SPARSE_OPTIMAL_P, 5000, 5000 }, + { REPEAT_SPARSE_OPTIMAL_P, 65534, 65534 }, // {N, M} repeats { REPEAT_SPARSE_OPTIMAL_P, 10, 20 }, { REPEAT_SPARSE_OPTIMAL_P, 20, 40 }, @@ -528,7 +529,8 @@ const RepeatTestInfo sparseRepeats[] = { { REPEAT_SPARSE_OPTIMAL_P, 3500, 4000 }, { REPEAT_SPARSE_OPTIMAL_P, 4000, 8000 }, { REPEAT_SPARSE_OPTIMAL_P, 4500, 8000 }, - { REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 } + { REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 }, + { REPEAT_SPARSE_OPTIMAL_P, 60000, 65534 } }; static @@ -802,7 +804,7 @@ TEST_P(SparseOptimalTest, Simple1) { 1000 + info->repeatMax * 2)); ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 1000 + info->repeatMax * 2 + 1)); - ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 10000)); + ASSERT_EQ(0U, repeatNextMatch(info, ctrl, state, 100000)); } TEST_P(SparseOptimalTest, TwoTopsNeg) { From 2603be3924f155b851851e2076e1cd52a37f99e5 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 14:58:01 +1100 Subject: [PATCH 17/54] storeInitialRingTopPatch: fix large delta bug Check for staleness up front, so that it is safe to use u32 values to handle adding more tops. Adds LargeGap unit tests. 
--- src/nfa/repeat.c | 38 +++++++++++++++++++++++--------------- unit/internal/repeat.cpp | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/nfa/repeat.c b/src/nfa/repeat.c index 0344423b..59205ae4 100644 --- a/src/nfa/repeat.c +++ b/src/nfa/repeat.c @@ -1165,7 +1165,7 @@ static void storeInitialRingTopPatch(const struct RepeatInfo *info, struct RepeatRingControl *xs, u8 *state, u64a offset) { - DEBUG_PRINTF("set the first patch\n"); + DEBUG_PRINTF("set the first patch, offset=%llu\n", offset); xs->offset = offset; u8 *active = state; @@ -1340,21 +1340,33 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info, union RepeatControl *ctrl, void *state, u64a offset, char is_alive) { struct RepeatRingControl *xs = &ctrl->ring; - - u64a delta = offset - xs->offset; - u32 patch_size = info->patchSize; - u32 patch_count = info->patchCount; - u32 encoding_size = info->encodingSize; - u32 patch = delta / patch_size; - DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, encoding_size); - u8 *active = (u8 *)state; - if (!is_alive) { + + DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, + info->encodingSize); + + // If (a) this is the first top, or (b) the ring is stale, initialize the + // ring and write this offset in as the first top. + if (!is_alive || + offset > + repeatLastTopSparseOptimalP(info, ctrl, state) + info->repeatMax) { storeInitialRingTopPatch(info, xs, active, offset); return; } - assert(offset >= xs->offset); + // Tops should arrive in order, with no duplicates. + assert(offset > repeatLastTopSparseOptimalP(info, ctrl, state)); + + // As the ring is not stale, our delta should fit within a u32. 
+ assert(offset - xs->offset <= UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + u32 patch = delta / patch_size; + + DEBUG_PRINTF("delta=%u, patch_size=%u, patch=%u\n", delta, patch_size, + patch); u8 *ring = active + info->patchesOffset; u32 occ = ringOccupancy(xs, patch_count); @@ -1365,10 +1377,6 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info, patch, patch_count, occ); if (patch >= patch_count) { u32 patch_shift_count = patch - patch_count + 1; - if (patch_shift_count >= patch_count) { - storeInitialRingTopPatch(info, xs, active, offset); - return; - } assert(patch >= patch_shift_count); DEBUG_PRINTF("shifting by %u\n", patch_shift_count); xs->offset += patch_size * patch_shift_count; diff --git a/unit/internal/repeat.cpp b/unit/internal/repeat.cpp index db797d24..94f1bdc1 100644 --- a/unit/internal/repeat.cpp +++ b/unit/internal/repeat.cpp @@ -448,6 +448,25 @@ TEST_P(RepeatTest, Pack) { } } +TEST_P(RepeatTest, LargeGap) { + SCOPED_TRACE(testing::Message() << "Repeat: " << info); + + if (info.repeatMax == REPEAT_INF) { + return; // Test not valid for FIRST-type repeats. + } + + for (int i = 0; i < 64; i++) { + u64a top1 = 1000; + repeatStore(&info, ctrl, state, top1, 0); // first top + ASSERT_EQ(top1, repeatLastTop(&info, ctrl, state)); + + // Add a second top after a gap of 2^i bytes. 
+ u64a top2 = top1 + (1ULL << i); + repeatStore(&info, ctrl, state, top2, 1); // second top + ASSERT_EQ(top2, repeatLastTop(&info, ctrl, state)); + } +} + static const u32 sparsePeriods[] = { 2, @@ -895,6 +914,24 @@ TEST_P(SparseOptimalTest, Simple3e) { test_sparse3entryExpire(info, ctrl, state, 2 * info->minPeriod - 1); } +TEST_P(SparseOptimalTest, LargeGap) { + SCOPED_TRACE(testing::Message() << "Repeat: " << *info); + + for (int i = 0; i < 64; i++) { + u64a top1 = 1000; + repeatStore(info, ctrl, state, top1, 0); // first top + ASSERT_EQ(top1, repeatLastTop(info, ctrl, state)); + + // Add a second top after a gap of 2^i bytes. + u64a top2 = top1 + (1ULL << i); + if (top2 - top1 < info->minPeriod) { + continue; // not a valid top + } + repeatStore(info, ctrl, state, top2, 1); // second top + ASSERT_EQ(top2, repeatLastTop(info, ctrl, state)); + } +} + TEST_P(SparseOptimalTest, ThreeTops) { SCOPED_TRACE(testing::Message() << "Repeat: " << *info); From 46ad39f253a4c604d867337206501a4c74dd6dd6 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 15:17:17 +1100 Subject: [PATCH 18/54] Add inlined sparseLastTop This allows the code to be inlined into other sparse optimal repeat functions. 
--- src/nfa/repeat.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/nfa/repeat.c b/src/nfa/repeat.c index 59205ae4..92b61874 100644 --- a/src/nfa/repeat.c +++ b/src/nfa/repeat.c @@ -1199,12 +1199,10 @@ u32 getSparseOptimalTargetValue(const struct RepeatInfo *info, return loc; } -u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, - const union RepeatControl *ctrl, - const void *state) { +static +u64a sparseLastTop(const struct RepeatInfo *info, + const struct RepeatRingControl *xs, const u8 *state) { DEBUG_PRINTF("looking for last top\n"); - const struct RepeatRingControl *xs = &ctrl->ring; - u32 patch_size = info->patchSize; u32 patch_count = info->patchCount; u32 encoding_size = info->encodingSize; @@ -1216,7 +1214,7 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, } DEBUG_PRINTF("patch%u encoding_size%u occ%u\n", patch, encoding_size, occ); - const u8 *ring = (const u8 *)state + info->patchesOffset; + const u8 *ring = state + info->patchesOffset; u64a val = partial_load_u64a(ring + encoding_size * patch, encoding_size); DEBUG_PRINTF("val:%llu\n", val); @@ -1233,6 +1231,12 @@ u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, return 0; } +u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state) { + return sparseLastTop(info, &ctrl->ring, state); +} + u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, const union RepeatControl *ctrl, const void *state, u64a offset) { @@ -1251,9 +1255,7 @@ u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, if (nextOffset <= xs->offset + info->repeatMin) { patch = xs->first; tval = 0; - } else if (nextOffset > - repeatLastTopSparseOptimalP(info, ctrl, state) + - info->repeatMax) { + } else if (nextOffset > sparseLastTop(info, xs, state) + info->repeatMax) { DEBUG_PRINTF("ring is stale\n"); return 0; } else { @@ -1348,14 +1350,13 @@ void 
repeatStoreSparseOptimalP(const struct RepeatInfo *info, // If (a) this is the first top, or (b) the ring is stale, initialize the // ring and write this offset in as the first top. if (!is_alive || - offset > - repeatLastTopSparseOptimalP(info, ctrl, state) + info->repeatMax) { + offset > sparseLastTop(info, xs, state) + info->repeatMax) { storeInitialRingTopPatch(info, xs, active, offset); return; } // Tops should arrive in order, with no duplicates. - assert(offset > repeatLastTopSparseOptimalP(info, ctrl, state)); + assert(offset > sparseLastTop(info, xs, state)); // As the ring is not stale, our delta should fit within a u32. assert(offset - xs->offset <= UINT32_MAX); @@ -1504,8 +1505,7 @@ enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, if (offset < xs->offset + info->repeatMin) { DEBUG_PRINTF("too soon\n"); return REPEAT_NOMATCH; - } else if (offset > repeatLastTopSparseOptimalP(info, ctrl, state) + - info->repeatMax) { + } else if (offset > sparseLastTop(info, xs, state) + info->repeatMax) { DEBUG_PRINTF("stale\n"); return REPEAT_STALE; } From 4c53bd46413cd54f0c85863addf675edfca88f9b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:11:56 +1100 Subject: [PATCH 19/54] parser: use 'override' keyword in subclasses --- src/parser/check_refs.cpp | 2 +- src/parser/prefilter.cpp | 2 +- src/parser/shortcut_literal.cpp | 2 +- src/parser/unsupported.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parser/check_refs.cpp b/src/parser/check_refs.cpp index 6307a06a..ad81ae76 100644 --- a/src/parser/check_refs.cpp +++ b/src/parser/check_refs.cpp @@ -57,7 +57,7 @@ public: ReferenceVisitor(size_t num_groups, const flat_set &targets) : num_ids(num_groups), names(targets) {} - ~ReferenceVisitor(); + ~ReferenceVisitor() override; void invalid_index(const char *component, unsigned id) { assert(component); diff --git a/src/parser/prefilter.cpp b/src/parser/prefilter.cpp index de99d9f1..f5a0c66c 100644 
--- a/src/parser/prefilter.cpp +++ b/src/parser/prefilter.cpp @@ -201,7 +201,7 @@ const ComponentSequence *findCapturingGroup(const Component *root, class PrefilterVisitor : public DefaultComponentVisitor { public: PrefilterVisitor(Component *c, const ParseMode &m) : root(c), mode(m) {} - ~PrefilterVisitor(); + ~PrefilterVisitor() override; /** \brief Calls the visitor (recursively) on a new replacement component * we've just created. Takes care of freeing it if the sequence is itself diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index d2b77dc5..f6f5d383 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -64,7 +64,7 @@ namespace ue2 { */ class ConstructLiteralVisitor : public ConstComponentVisitor { public: - ~ConstructLiteralVisitor(); + ~ConstructLiteralVisitor() override; /** \brief Thrown if this component does not represent a literal. */ struct NotLiteral {}; diff --git a/src/parser/unsupported.cpp b/src/parser/unsupported.cpp index b0b15d8d..c97a5750 100644 --- a/src/parser/unsupported.cpp +++ b/src/parser/unsupported.cpp @@ -44,7 +44,7 @@ namespace ue2 { * an unsupported component. 
*/ class UnsupportedVisitor : public DefaultConstComponentVisitor { public: - ~UnsupportedVisitor(); + ~UnsupportedVisitor() override; void pre(const ComponentAssertion &) override { throw ParseError("Zero-width assertions are not supported."); } From 5805ac193c5c20c32930d5d4fd5c62bbbcb8ec7b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:12:36 +1100 Subject: [PATCH 20/54] NGWrapper: mark dtor with override --- src/nfagraph/ng.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index fe78254c..52353da9 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -64,7 +64,7 @@ public: bool prefilter, const som_type som, ReportID rid, u64a min_offset, u64a max_offset, u64a min_length); - ~NGWrapper(); + ~NGWrapper() override; /** index of the expression represented by this graph, used * - down the track in error handling From fb834114e573602daead103d490f888e2899f566 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:13:12 +1100 Subject: [PATCH 21/54] limex_dump: use 'override' keyword in subclass --- src/nfa/limex_dump.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index b836b63b..084f35dd 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -317,7 +317,7 @@ template struct limex_labeller : public nfa_labeller { explicit limex_labeller(const limex_type *limex_in) : limex(limex_in) {} - void label_state(FILE *f, u32 state) const { + void label_state(FILE *f, u32 state) const override { const typename limex_traits::exception_type *exceptions = getExceptionTable(limex); if (!testbit((const u8 *)&limex->exceptionMask, From ed4a3cdcf102820367117897af09c1d510d20765 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:19:47 +1100 Subject: [PATCH 22/54] compare: always use braces for for/if blocks --- src/util/compare.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 
deletions(-) diff --git a/src/util/compare.h b/src/util/compare.h index b7237ec2..11c01f08 100644 --- a/src/util/compare.h +++ b/src/util/compare.h @@ -98,18 +98,22 @@ u64a theirtoupper64(const u64a x) { static really_inline int cmpNocaseNaive(const u8 *p1, const u8 *p2, size_t len) { const u8 *pEnd = (const u8 *)p1 + len; - for (; p1 < pEnd; p1++, p2++) - if (mytolower(*p1) != mytolower(*p2)) + for (; p1 < pEnd; p1++, p2++) { + if (mytolower(*p1) != mytolower(*p2)) { return 1; + } + } return 0; } static really_inline int cmpCaseNaive(const u8 *p1, const u8 *p2, size_t len) { const u8 *pEnd = (const u8 *)p1 + len; - for (; p1 < pEnd; p1++, p2++) - if (*p1 != *p2) + for (; p1 < pEnd; p1++, p2++) { + if (*p1 != *p2) { return 1; + } + } return 0; } From 51c80200399966fa9c1f0ad6b629071a2d1ec636 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:21:56 +1100 Subject: [PATCH 23/54] simplegrep: use correct sign in printf format --- examples/simplegrep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/simplegrep.c b/examples/simplegrep.c index 2fe6e6f3..9e392a8f 100644 --- a/examples/simplegrep.c +++ b/examples/simplegrep.c @@ -109,7 +109,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) { * limit the size of our buffer appropriately. 
*/ if ((unsigned long)dataLen > UINT_MAX) { dataLen = UINT_MAX; - printf("WARNING: clipping data to %lu bytes\n", dataLen); + printf("WARNING: clipping data to %ld bytes\n", dataLen); } else if (dataLen == 0) { fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN); fclose(f); @@ -118,7 +118,7 @@ static char *readInputData(const char *inputFN, unsigned int *length) { char *inputData = malloc(dataLen); if (!inputData) { - fprintf(stderr, "ERROR: unable to malloc %lu bytes\n", dataLen); + fprintf(stderr, "ERROR: unable to malloc %ld bytes\n", dataLen); fclose(f); return NULL; } From 863ea1b2b26603527bb581a6f12d6ce2a7d20939 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:22:39 +1100 Subject: [PATCH 24/54] mpv_dump: correct hex escapes in printf format --- src/nfa/mpv_dump.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nfa/mpv_dump.cpp b/src/nfa/mpv_dump.cpp index b6e702ac..e731df87 100644 --- a/src/nfa/mpv_dump.cpp +++ b/src/nfa/mpv_dump.cpp @@ -70,9 +70,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) { break; case MPV_VERM: if (!ourisprint(k->u.verm.c)) { - fprintf(f, "verm 0x%hhu\n", k->u.verm.c); + fprintf(f, "verm 0x%02x\n", k->u.verm.c); } else { - fprintf(f, "verm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c); + fprintf(f, "verm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c); } break; case MPV_SHUFTI: @@ -87,9 +87,9 @@ void dumpKilo(FILE *f, const mpv *m, const mpv_kilopuff *k) { break; case MPV_NVERM: if (!ourisprint(k->u.verm.c)) { - fprintf(f, "nverm 0x%hhu\n", k->u.verm.c); + fprintf(f, "nverm 0x%02x\n", k->u.verm.c); } else { - fprintf(f, "nverm 0x%hhu '%c'\n", k->u.verm.c, k->u.verm.c); + fprintf(f, "nverm 0x%02x '%c'\n", k->u.verm.c, k->u.verm.c); } break; default: From 62776b615b32e7c8101fd41a0aa13928bdd7839c Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:23:27 +1100 Subject: [PATCH 25/54] nfa_api_queue: debug printf format fix --- src/nfa/nfa_api_queue.h | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index 75c15cce..59c18fca 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -277,7 +277,7 @@ void debugQueue(const struct mq *q) { type = "MQE_TOP_N"; break; } - DEBUG_PRINTF("\tq[%u] %lld %d:%s\n", cur, q->items[cur].location, + DEBUG_PRINTF("\tq[%u] %lld %u:%s\n", cur, q->items[cur].location, q->items[cur].type, type); } } From d9efe071254acd762f12598f16c4a76a71082288 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 Nov 2015 16:24:06 +1100 Subject: [PATCH 26/54] depth: correct sign in printf format --- src/util/depth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/depth.h b/src/util/depth.h index 29ff2819..977fd0c3 100644 --- a/src/util/depth.h +++ b/src/util/depth.h @@ -183,7 +183,7 @@ public: s64a rv = val + d; if (rv < 0 || (u64a)rv >= val_infinity) { - DEBUG_PRINTF("depth %llu too large to represent!\n", rv); + DEBUG_PRINTF("depth %lld too large to represent!\n", rv); throw DepthOverflowError(); } From 9a7b912a5dae8b15de3b932405713733eb22d23b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 9 Nov 2015 10:37:20 +1100 Subject: [PATCH 27/54] Rework parser rejection for POSIX collating elems Implement rejection of POSIX collating elements ("[.ch.]" and "[=ch=]" entirely in the Ragel parser, using the same approach both inside and ouside character classes. Fix buggy rejection of [^.ch.], which we should accept as a character class. 
--- src/parser/Parser.rl | 34 +++++++++++++-------------------- unit/hyperscan/bad_patterns.txt | 4 ++++ 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 1481b7d8..37beb765 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -790,10 +790,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) { any => { throw LocatedParseError("Unknown property"); }; *|; charClassGuts := |* - # We don't like POSIX collating elements (neither does PCRE or Perl). - '\[\.' [^\]]* '\.\]' | - '\[=' [^\]]* '=\]' => { - throw LocatedParseError("Unsupported POSIX collating element"); + # We don't support POSIX collating elements (neither does PCRE + # or Perl). These look like [.ch.] or [=ch=]. + '\[\.' ( '\\]' | [^\]] )* '\.\]' | + '\[=' ( '\\]' | [^\]] )* '=\]' => { + throw LocatedParseError("Unsupported POSIX collating " + "element"); }; # Named sets # Adding these may cause the charclass to close, hence the @@ -1090,23 +1092,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) { throwInvalidUtf8(); }; - # dot or equals at the end of a character class could be the end - # of a collating element, like [.blah.] or [=blah=]. - [.=] ']' => { - if (currentCls->getFirstChar() == *ts) { - assert(currentClsBegin); - ostringstream oss; - oss << "Unsupported POSIX collating element at index " - << currentClsBegin - ptr << "."; - throw ParseError(oss.str()); - } - currentCls->add(*ts); - currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); - inCharClass = false; - fgoto main; - }; - # Literal character (any - ']') => { if (currentCls->class_empty()) { @@ -1232,6 +1217,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) { throw LocatedParseError("POSIX named classes are only " "supported inside a class"); }; + # We don't support POSIX collating elements (neither does PCRE + # or Perl). These look like [.ch.] or [=ch=]. + '\[\.' 
( '\\]' | [^\]] )* '\.\]' | + '\[=' ( '\\]' | [^\]] )* '=\]' => { + throw LocatedParseError("Unsupported POSIX collating " + "element"); + }; # Begin eating characters for class '\[' => eatClass; # Begin quoted literal diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 1ad445b3..837ba871 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -128,3 +128,7 @@ 128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ρρρρρρρρρρρρρρρρρρρρρρρρρρρ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8. 129:/bignum \1111111111111111111/ #Number is too big at index 7. 130:/foo|&{5555555,}/ #Bounded repeat is too large. +131:/[a[..]]/ #Unsupported POSIX collating element at index 2. +132:/[a[==]]/ #Unsupported POSIX collating element at index 2. +133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2. +134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2. From b1f6a539c7e23771933be57ac97a28f5b458f3aa Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 9 Nov 2015 10:49:19 +1100 Subject: [PATCH 28/54] Remove dead ComponentClass::{get,set}FirstChar --- src/parser/ComponentClass.cpp | 2 +- src/parser/ComponentClass.h | 11 +---------- src/parser/Parser.rl | 3 --- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/parser/ComponentClass.cpp b/src/parser/ComponentClass.cpp index d90d7db2..4570734d 100644 --- a/src/parser/ComponentClass.cpp +++ b/src/parser/ComponentClass.cpp @@ -420,7 +420,7 @@ unique_ptr getLiteralComponentClass(unsigned char c, ComponentClass::ComponentClass(const ParseMode &mode_in) : m_negate(false), mode(mode_in), in_cand_range(false), - range_start(INVALID_UNICODE), finalized(false), firstChar('\0') {} + range_start(INVALID_UNICODE), finalized(false) {} ComponentClass::~ComponentClass() { } diff --git a/src/parser/ComponentClass.h b/src/parser/ComponentClass.h index 21b51202..ce3b49f3 100644 --- a/src/parser/ComponentClass.h +++ b/src/parser/ComponentClass.h 
@@ -245,9 +245,6 @@ public: bool isNegated() const { return m_negate; } - void setFirstChar(char c) { firstChar = c; } - char getFirstChar() const { return firstChar; } - std::vector first() const override = 0; std::vector last() const override = 0; bool empty() const override { return false; } /* always 1 codepoint wide */ @@ -263,19 +260,13 @@ protected: unichar range_start; bool finalized; - /** Literal character at the start of this character class, e.g. '.' for - * the class [.abc]. Used to identify (unsupported) POSIX collating - * elements. */ - char firstChar; - virtual void createRange(unichar) = 0; // Protected copy ctor. Use clone instead. ComponentClass(const ComponentClass &other) : Component(other), m_negate(other.m_negate), mode(other.mode), in_cand_range(other.in_cand_range), range_start(other.range_start), - finalized(other.finalized), - firstChar(other.firstChar) {} + finalized(other.finalized) {} }; } // namespace ue2 diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 37beb765..a0378dce 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1094,9 +1094,6 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # Literal character (any - ']') => { - if (currentCls->class_empty()) { - currentCls->setFirstChar(*ts); - } currentCls->add(*ts); }; From c68bfe05d860265887101c4196f1c02d15d506b6 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 9 Nov 2015 12:50:52 +1100 Subject: [PATCH 29/54] Don't use class_empty in early class parsing Instead, explicitly track whether we're still in the early class parsing machine. 
--- src/parser/Parser.rl | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index a0378dce..96656875 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -424,6 +424,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { assert(!inCharClass); // not reentrant currentCls = getComponentClass(mode); inCharClass = true; + inCharClassEarly = true; currentClsBegin = ts; fgoto readClass; } @@ -474,6 +475,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { } action is_utf8 { mode.utf8 } action is_ignore_space { mode.ignore_space } + action is_early_charclass { inCharClassEarly } action addNumberedBackRef { if (accumulator == 0) { @@ -1109,25 +1111,24 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # Parser to read stuff from a character class ############################################################# readClass := |* - # the negate and right bracket out the front are special - '\^' => { + # A caret at the beginning of the class means that the rest of the + # class is negated. + '\^' when is_early_charclass => { if (currentCls->isNegated()) { + // Already seen a caret; the second one is not a meta-character. + inCharClassEarly = false; fhold; fgoto charClassGuts; } else { currentCls->negate(); + // Note: we cannot switch off inCharClassEarly here, as /[^]]/ + // needs to use the right square bracket path below. } }; - ']' => { - // if this is the first thing in the class, add it and move along, - // otherwise jump into the char class machine to handle what might - // end up as fail - if (currentCls->class_empty()) { - currentCls->add(']'); - } else { - // leave it for the next machine - fhold; - } - fgoto charClassGuts; + # A right square bracket before anything "real" is interpreted as a + # literal right square bracket. 
+ ']' when is_early_charclass => { + currentCls->add(']'); + inCharClassEarly = false; }; # if we hit a quote before anything "real", handle it #'\\Q' => { fcall readQuotedClass; }; @@ -1137,7 +1138,11 @@ unichar readUtf8CodePoint4c(const u8 *ts) { '\\E' => { /*noop*/}; # time for the real work to happen - any => { fhold; fgoto charClassGuts; }; + any => { + inCharClassEarly = false; + fhold; + fgoto charClassGuts; + }; *|; ############################################################# @@ -1885,6 +1890,11 @@ unique_ptr parse(const char *const c_ptr, ParseMode &globalMode) { // brackets [..]. bool inCharClass = false; + // True if the machine is inside a character class but it has not processed + // any "real" elements yet, i.e. it's still processing meta-characters like + // '^'. + bool inCharClassEarly = false; + // Location at which the current character class began. const u8 *currentClsBegin = p; From 9cffa7666ffda10b2c506c926adf94c11dc8fb44 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 9 Nov 2015 12:59:36 +1100 Subject: [PATCH 30/54] Refine ComponentClass::class_empty ComponentClass::class_empty should only be used on finalized classes to determine whether a given class contains any elements; it should not take the cr_ucp or cps_ucp into account, as they have been folden in by the finalize call. Fixes our failure to identify that the pattern /[^\D\d]/8W can never match. 
--- src/parser/AsciiComponentClass.cpp | 3 ++- src/parser/ComponentClass.cpp | 1 - src/parser/ComponentClass.h | 8 ++++++-- src/parser/Utf8ComponentClass.cpp | 3 ++- unit/hyperscan/bad_patterns.txt | 1 + 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/parser/AsciiComponentClass.cpp b/src/parser/AsciiComponentClass.cpp index de7a40d4..44ecb5bb 100644 --- a/src/parser/AsciiComponentClass.cpp +++ b/src/parser/AsciiComponentClass.cpp @@ -52,7 +52,8 @@ AsciiComponentClass *AsciiComponentClass::clone() const { } bool AsciiComponentClass::class_empty(void) const { - return cr.none() && cr_ucp.none(); + assert(finalized); + return cr.none(); } void AsciiComponentClass::createRange(unichar to) { diff --git a/src/parser/ComponentClass.cpp b/src/parser/ComponentClass.cpp index 4570734d..43c05898 100644 --- a/src/parser/ComponentClass.cpp +++ b/src/parser/ComponentClass.cpp @@ -441,7 +441,6 @@ void ComponentClass::addDash(void) { } void ComponentClass::negate() { - assert(class_empty()); m_negate = true; } diff --git a/src/parser/ComponentClass.h b/src/parser/ComponentClass.h index ce3b49f3..1cb1a7d0 100644 --- a/src/parser/ComponentClass.h +++ b/src/parser/ComponentClass.h @@ -232,8 +232,12 @@ public: Component *accept(ComponentVisitor &v) override = 0; void accept(ConstComponentVisitor &v) const override = 0; - /** True iff we have already started adding members to the class. This is - * a different concept to Component::empty */ + /** \brief True if the class contains no members (i.e. it will not match + * against anything). This function can only be called on a finalized + * class. + * + * Note: This is a different concept to Component::empty. 
+ */ virtual bool class_empty(void) const = 0; virtual void add(PredefinedClass c, bool negated) = 0; diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index 72fc2146..3a6a85a4 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -484,7 +484,8 @@ UTF8ComponentClass *UTF8ComponentClass::clone() const { } bool UTF8ComponentClass::class_empty(void) const { - return cps.none() && cps_ucp.none(); + assert(finalized); + return cps.none(); } void UTF8ComponentClass::createRange(unichar to) { diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 837ba871..3b13e064 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -132,3 +132,4 @@ 132:/[a[==]]/ #Unsupported POSIX collating element at index 2. 133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2. 134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2. +135:/[^\D\d]/8W #Pattern can never match. 
From b9d3b73ab8264aaa6cf2dd66a857afc84731177c Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 5 Nov 2015 14:46:07 +1100 Subject: [PATCH 31/54] Fix includes to meet our usual guidelines --- unit/internal/nfagraph_find_matches.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit/internal/nfagraph_find_matches.cpp b/unit/internal/nfagraph_find_matches.cpp index 41d28c52..553d6dc5 100644 --- a/unit/internal/nfagraph_find_matches.cpp +++ b/unit/internal/nfagraph_find_matches.cpp @@ -36,9 +36,9 @@ #include "nfagraph/ng_builder.h" #include "nfagraph/ng.h" #include "nfagraph/ng_asserts.h" -#include "util/target_info.h" #include "hs_compile.h" -#include "ng_find_matches.h" +#include "util/ng_find_matches.h" +#include "util/target_info.h" using namespace std; using namespace testing; From f65170da5b15a77ddd8fb8948c72ab7b00170f06 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 5 Nov 2015 14:49:04 +1100 Subject: [PATCH 32/54] cmake: improve build paths for nested builds If Hyperscan is built as a subproject of another cmake project, it helps to refer to PROJECT_xx_DIR instead of CMAKE_xx_DIR, etc. --- CMakeLists.txt | 24 +++++++++++++++++------- src/fdr/CMakeLists.txt | 10 +++++----- unit/CMakeLists.txt | 5 +++-- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 67885c93..a1dfcaf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) string (TIMESTAMP BUILD_DATE "%Y-%m-%d") -set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) INCLUDE (CheckFunctionExists) @@ -56,8 +56,9 @@ if(CMAKE_GENERATOR STREQUAL Xcode) set(XCODE TRUE) endif() -include_directories(src .) 
-include_directories(${CMAKE_BINARY_DIR}) +set(CMAKE_INCLUDE_CURRENT_DIR 1) +include_directories(${PROJECT_SOURCE_DIR}/src) +include_directories(${PROJECT_BINARY_DIR}) include_directories(SYSTEM include) set(BOOST_USE_STATIC_LIBS OFF) @@ -71,7 +72,7 @@ find_package(Boost ${BOOST_MINVERSION}) if(NOT Boost_FOUND) # we might have boost in tree, so provide a hint and try again message(STATUS "trying include dir for boost") - set(BOOST_INCLUDEDIR "${CMAKE_SOURCE_DIR}/include") + set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include") find_package(Boost ${BOOST_MINVERSION}) if(NOT Boost_FOUND) message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") @@ -219,6 +220,15 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN) CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN) +if (RELEASE_BUILD) + if (HAS_C_HIDDEN) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden") + endif() + if (HAS_CXX_HIDDEN) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden") + endif() +endif() + # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) @@ -327,8 +337,8 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/tools) endif() # do substitutions -configure_file(${CMAKE_MODULE_PATH}/config.h.in ${CMAKE_BINARY_DIR}/config.h) -configure_file(src/hs_version.h.in hs_version.h) +configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) +configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h) if (PKG_CONFIG_FOUND) # we really only need to do this if we 
have pkg-config @@ -345,7 +355,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") # include the autogen targets add_subdirectory(src/fdr) -include_directories(${CMAKE_BINARY_DIR}/src/fdr) +include_directories(${PROJECT_BINARY_DIR}/src/fdr) if(NOT WIN32) set(RAGEL_C_FLAGS "-Wno-unused") diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt index 25396689..1436c3fc 100644 --- a/src/fdr/CMakeLists.txt +++ b/src/fdr/CMakeLists.txt @@ -27,11 +27,11 @@ fdr_autogen(teddy_runtime teddy_autogen.c) fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp) set(fdr_GENERATED_SRC -${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen.c -${CMAKE_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp -${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen.c -${CMAKE_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp -PARENT_SCOPE) + ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c + ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp + ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c + ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp + PARENT_SCOPE) set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE) include_directories(${CMAKE_CURRENT_BINARY_DIR}) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 9bc74e23..a8925a3c 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -7,7 +7,8 @@ if(NOT XCODE) else() set(CMAKE_CXX_FLAGS "-isystem ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CXX_FLAGS}") endif() -include_directories(${CMAKE_SOURCE_DIR}/util) + +include_directories(${PROJECT_SOURCE_DIR}) # remove some warnings # cmake's scope means these only apply here @@ -26,7 +27,7 @@ endif() add_library(gtest ${gtest_SOURCES}) -add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${CMAKE_SOURCE_DIR}) +add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) if (NOT RELEASE_BUILD) set(unit_internal_SOURCES From cf3ddd9e88e4ca27991f13a6da73f2b9e748057d Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 10 Nov 2015 16:18:42 +1100 Subject: [PATCH 33/54] 
repeatStoreSparseOptimalP: make diff a u32 As delta is a u32, we know diff will always fit within a u32 as well. Silences a warning from Coverity. --- src/nfa/repeat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nfa/repeat.c b/src/nfa/repeat.c index 92b61874..c1ff5162 100644 --- a/src/nfa/repeat.c +++ b/src/nfa/repeat.c @@ -1414,7 +1414,8 @@ void repeatStoreSparseOptimalP(const struct RepeatInfo *info, } } - u64a diff = delta - patch * patch_size; + assert((u64a)patch * patch_size <= delta); + u32 diff = delta - patch * patch_size; const u64a *repeatTable = getImplTable(info); val += repeatTable[diff]; From 2a2576e90784b473832058ff06c591118e346445 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 12 Nov 2015 15:27:11 +1100 Subject: [PATCH 34/54] Introduce copy_bytes for writing into bytecode Protects memcpy from nullptr sources, which triggers failures in GCC's UB sanitizer. --- src/nfa/goughcompile.cpp | 11 +++------ src/nfa/limex_compile.cpp | 13 ++++------ src/nfagraph/ng_lbr.cpp | 5 ++-- src/rose/rose_build_bytecode.cpp | 41 ++++++++++---------------------- src/util/container.h | 29 ++++++++++++++++++++++ 5 files changed, 51 insertions(+), 48 deletions(-) diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index d2de7b95..d735c80a 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -1136,16 +1136,11 @@ aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, gough_dfa->length = gough_size; /* copy in blocks */ - memcpy((u8 *)gough_dfa.get() + edge_prog_offset, &edge_blocks[0], - byte_length(edge_blocks)); + copy_bytes((u8 *)gough_dfa.get() + edge_prog_offset, edge_blocks); if (top_prog_offset) { - memcpy((u8 *)gough_dfa.get() + top_prog_offset, &top_blocks[0], - byte_length(top_blocks)); - } - if (!temp_blocks.empty()) { - memcpy((u8 *)gough_dfa.get() + prog_base_offset, &temp_blocks[0], - byte_length(temp_blocks)); + copy_bytes((u8 *)gough_dfa.get() + top_prog_offset, top_blocks); } + 
copy_bytes((u8 *)gough_dfa.get() + prog_base_offset, temp_blocks); return gough_dfa; } diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 5cf46334..a6c34cb6 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -1397,8 +1397,7 @@ struct Factory { repeat->horizon = rsi.horizon; repeat->packedCtrlSize = rsi.packedCtrlSize; repeat->stateSize = rsi.stateSize; - memcpy(repeat->packedFieldSizes, rsi.packedFieldSizes.data(), - byte_length(rsi.packedFieldSizes)); + copy_bytes(repeat->packedFieldSizes, rsi.packedFieldSizes); repeat->patchCount = rsi.patchCount; repeat->patchSize = rsi.patchSize; repeat->encodingSize = rsi.encodingSize; @@ -1413,8 +1412,7 @@ struct Factory { // Copy in the sparse lookup table. if (br.type == REPEAT_SPARSE_OPTIMAL_P) { assert(!rsi.table.empty()); - memcpy(info_ptr + tableOffset, rsi.table.data(), - byte_length(rsi.table)); + copy_bytes(info_ptr + tableOffset, rsi.table); } // Fill the tug mask. @@ -1702,6 +1700,7 @@ struct Factory { for (u32 i = 0; i < num_repeats; i++) { repeatOffsets[i] = offset; + assert(repeats[i].first); memcpy((char *)limex + offset, repeats[i].first.get(), repeats[i].second); offset += repeats[i].second; @@ -1709,8 +1708,7 @@ struct Factory { // Write repeat offset lookup table. 
assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32))); - memcpy((char *)limex + repeatOffsetsOffset, repeatOffsets.data(), - byte_length(repeatOffsets)); + copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets); limex->repeatOffset = repeatOffsetsOffset; limex->repeatCount = num_repeats; @@ -1725,8 +1723,7 @@ struct Factory { limex->exReportOffset = exceptionReportsOffset; assert(ISALIGNED_N((char *)limex + exceptionReportsOffset, alignof(ReportID))); - memcpy((char *)limex + exceptionReportsOffset, reports.data(), - byte_length(reports)); + copy_bytes((char *)limex + exceptionReportsOffset, reports); } static diff --git a/src/nfagraph/ng_lbr.cpp b/src/nfagraph/ng_lbr.cpp index 11eded69..b9cacaa7 100644 --- a/src/nfagraph/ng_lbr.cpp +++ b/src/nfagraph/ng_lbr.cpp @@ -98,8 +98,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin, info->packedCtrlSize = rsi.packedCtrlSize; info->horizon = rsi.horizon; info->minPeriod = minPeriod; - memcpy(&info->packedFieldSizes, rsi.packedFieldSizes.data(), - byte_length(rsi.packedFieldSizes)); + copy_bytes(&info->packedFieldSizes, rsi.packedFieldSizes); info->patchCount = rsi.patchCount; info->patchSize = rsi.patchSize; info->encodingSize = rsi.encodingSize; @@ -122,7 +121,7 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin, nfa->length = verify_u32(len); info->length = verify_u32(sizeof(RepeatInfo) + sizeof(u64a) * (rsi.patchSize + 1)); - memcpy(table, rsi.table.data(), byte_length(rsi.table)); + copy_bytes(table, rsi.table); } } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index bbc8644e..e17953aa 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -2687,12 +2687,6 @@ void fillInReportInfo(RoseEngine *engine, u32 reportOffset, sizeof(internal_report)); } -static -void populateInvDkeyTable(char *ptr, const ReportManager &rm) { - vector table = rm.getDkeyToReportTable(); - 
memcpy(ptr, table.data(), byte_length(table)); -} - static bool hasSimpleReports(const vector &reports) { auto it = find_if(reports.begin(), reports.end(), isComplexReport); @@ -4154,7 +4148,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->ekeyCount = rm.numEkeys(); engine->dkeyCount = rm.numDkeys(); engine->invDkeyOffset = dkeyOffset; - populateInvDkeyTable(ptr + dkeyOffset, rm); + copy_bytes(ptr + dkeyOffset, rm.getDkeyToReportTable()); engine->somHorizon = ssm.somPrecision(); engine->somLocationCount = ssm.numSomSlots(); @@ -4314,33 +4308,22 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { buildLitBenefits(*this, engine.get(), base_lits_benefits_offset); // Copy in other tables - memcpy(ptr + bc.engine_blob_base, bc.engine_blob.data(), - byte_length(bc.engine_blob)); - - memcpy(ptr + engine->literalOffset, literalTable.data(), - byte_length(literalTable)); - memcpy(ptr + engine->roleOffset, bc.roleTable.data(), - byte_length(bc.roleTable)); - copy(leftInfoTable.begin(), leftInfoTable.end(), - (LeftNfaInfo *)(ptr + engine->leftOffset)); + copy_bytes(ptr + bc.engine_blob_base, bc.engine_blob); + copy_bytes(ptr + engine->literalOffset, literalTable); + copy_bytes(ptr + engine->roleOffset, bc.roleTable); + copy_bytes(ptr + engine->leftOffset, leftInfoTable); fillLookaroundTables(ptr + lookaroundTableOffset, ptr + lookaroundReachOffset, bc.lookaround); fillInSomRevNfas(engine.get(), ssm, rev_nfa_table_offset, rev_nfa_offsets); - memcpy(ptr + engine->predOffset, predTable.data(), byte_length(predTable)); - memcpy(ptr + engine->rootRoleOffset, rootRoleTable.data(), - byte_length(rootRoleTable)); - memcpy(ptr + engine->anchoredReportMapOffset, art.data(), byte_length(art)); - memcpy(ptr + engine->anchoredReportInverseMapOffset, arit.data(), - byte_length(arit)); - memcpy(ptr + engine->multidirectOffset, mdr_reports.data(), - byte_length(mdr_reports)); - - copy(activeLeftIter.begin(), activeLeftIter.end(), - 
(mmbit_sparse_iter *)(ptr + engine->activeLeftIterOffset)); - - memcpy(ptr + engine->sideOffset, sideTable.data(), byte_length(sideTable)); + copy_bytes(ptr + engine->predOffset, predTable); + copy_bytes(ptr + engine->rootRoleOffset, rootRoleTable); + copy_bytes(ptr + engine->anchoredReportMapOffset, art); + copy_bytes(ptr + engine->anchoredReportInverseMapOffset, arit); + copy_bytes(ptr + engine->multidirectOffset, mdr_reports); + copy_bytes(ptr + engine->activeLeftIterOffset, activeLeftIter); + copy_bytes(ptr + engine->sideOffset, sideTable); DEBUG_PRINTF("rose done %p\n", engine.get()); return engine; diff --git a/src/util/container.h b/src/util/container.h index b4a10c89..62e841c1 100644 --- a/src/util/container.h +++ b/src/util/container.h @@ -33,8 +33,13 @@ #ifndef UTIL_CONTAINER_H #define UTIL_CONTAINER_H +#include "ue2common.h" + #include +#include +#include #include +#include #include namespace ue2 { @@ -92,11 +97,35 @@ std::set assoc_keys(const C &container) { return keys; } +/** + * \brief Return the length in bytes of the given vector of (POD) objects. + */ template typename std::vector::size_type byte_length(const std::vector &vec) { + static_assert(std::is_pod::value, "should be pod"); return vec.size() * sizeof(T); } +/** + * \brief Copy the given vector of POD objects to the given location in memory. + * It is safe to give this function an empty vector. + */ +template +void *copy_bytes(void *dest, const std::vector &vec) { + static_assert(std::is_pod::value, "should be pod"); + assert(dest); + + // Since we're generally using this function to write into the bytecode, + // dest should be appropriately aligned for T. + assert(ISALIGNED_N(dest, alignof(T))); + + if (vec.empty()) { + return dest; // Protect memcpy against null pointers. 
+ } + assert(vec.data() != nullptr); + return std::memcpy(dest, vec.data(), byte_length(vec)); +} + template bool is_subset_of(const OrderedContainer1 &small, const OrderedContainer2 &big) { static_assert(std::is_same Date: Thu, 12 Nov 2015 13:27:55 +1100 Subject: [PATCH 35/54] Restore \Q..\E support in character classes --- src/parser/Parser.rl | 12 +++--------- unit/hyperscan/bad_patterns.txt | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 96656875..65cd7c1a 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -893,11 +893,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { throw LocatedParseError("Invalid POSIX named class"); }; '\\Q' => { - // fcall readQuotedClass; - ostringstream str; - str << "\\Q..\\E sequences in character classes not supported at index " - << ts - ptr << "."; - throw ParseError(str.str()); + fcall readQuotedClass; }; '\\E' => { /*noop*/}; # Backspace (this is only valid for \b in char classes) @@ -1131,10 +1127,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { inCharClassEarly = false; }; # if we hit a quote before anything "real", handle it - #'\\Q' => { fcall readQuotedClass; }; - '\\Q' => { - throw LocatedParseError("\\Q..\\E sequences in character classes not supported"); - }; + '\\Q' => { fcall readQuotedClass; }; '\\E' => { /*noop*/}; # time for the real work to happen @@ -1170,6 +1163,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # Literal character any => { currentCls->add(*ts); + inCharClassEarly = false; }; *|; diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 3b13e064..fb2a2357 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -85,7 +85,6 @@ 84:/[=\]=]/ #Unsupported POSIX collating element at index 0. 85:/A(?!)+Z/ #Invalid repeat at index 5. 86:/\X/ #\X unsupported at index 0. -87:/[a\Qz\E]/ #\Q..\E sequences in character classes not supported at index 2. 
88:/[A-\d]/ #Invalid range in character class at index 3. 89:/[A-[:digit:]]/ #Invalid range in character class at index 3. 90:/B[--[:digit:]--]+/ #Invalid range in character class at index 4. From abbd54889959425c77689d30d1764079f62a8ecc Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 13 Nov 2015 14:36:28 +1100 Subject: [PATCH 36/54] ng_execute: update interface to use flat_set This changes all the execute_graph() interfaces so that instead of mutating a std::set of vertices, they accept an initial flat_set of states and return a resultant flat_set of states after execution. (Note that internally execute_graph() still uses bitsets) This is both faster and more flexible. --- src/nfagraph/ng_execute.cpp | 86 +++++++++++++++++---------------- src/nfagraph/ng_execute.h | 22 +++++---- src/nfagraph/ng_som.cpp | 15 +++--- src/nfagraph/ng_som_util.cpp | 8 +-- src/rose/rose_build_compile.cpp | 17 ++++--- 5 files changed, 79 insertions(+), 69 deletions(-) diff --git a/src/nfagraph/ng_execute.cpp b/src/nfagraph/ng_execute.cpp index aebfa712..92bef737 100644 --- a/src/nfagraph/ng_execute.cpp +++ b/src/nfagraph/ng_execute.cpp @@ -125,61 +125,62 @@ void execute_graph_i(const NGHolder &g, const vector &info, } static -void fillStateBitset(const NGHolder &g, const set &in, - dynamic_bitset<> &out) { - out.reset(); - for (auto v : in) { +dynamic_bitset<> makeStateBitset(const NGHolder &g, + const flat_set &in) { + dynamic_bitset<> work_states(num_vertices(g)); + for (const auto &v : in) { u32 idx = g[v].index; - out.set(idx); + work_states.set(idx); } + return work_states; } static -void fillVertexSet(const dynamic_bitset<> &in, - const vector &info, set &out) { - out.clear(); +flat_set getVertices(const dynamic_bitset<> &in, + const vector &info) { + flat_set out; for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) { out.insert(info[i].vertex); } + return out; } static -void fillInfoTable(const NGHolder &g, vector &info) { - info.resize(num_vertices(g)); 
+vector makeInfoTable(const NGHolder &g) { + vector info(num_vertices(g)); for (auto v : vertices_range(g)) { u32 idx = g[v].index; const CharReach &cr = g[v].char_reach; assert(idx < info.size()); info[idx] = StateInfo(v, cr); } + return info; } -void execute_graph(const NGHolder &g, const ue2_literal &input, - set *states, bool kill_sds) { +flat_set execute_graph(const NGHolder &g, const ue2_literal &input, + const flat_set &initial_states, + bool kill_sds) { assert(hasCorrectlyNumberedVertices(g)); - vector info; - fillInfoTable(g, info); - dynamic_bitset<> work_states(num_vertices(g)); - fillStateBitset(g, *states, work_states); + auto info = makeInfoTable(g); + auto work_states = makeStateBitset(g, initial_states); execute_graph_i(g, info, input, &work_states, kill_sds); - fillVertexSet(work_states, info, *states); + return getVertices(work_states, info); } -void execute_graph(const NGHolder &g, const vector &input, - set *states) { +flat_set execute_graph(const NGHolder &g, + const vector &input, + const flat_set &initial_states) { assert(hasCorrectlyNumberedVertices(g)); - vector info; - fillInfoTable(g, info); - dynamic_bitset<> work_states(num_vertices(g)); - fillStateBitset(g, *states, work_states); + auto info = makeInfoTable(g); + auto work_states = makeStateBitset(g, initial_states); execute_graph_i(g, info, input, &work_states, false); - fillVertexSet(work_states, info, *states); + return getVertices(work_states, info); } typedef boost::reverse_graph RevNFAGraph; @@ -276,9 +277,10 @@ private: }; } // namespace -void execute_graph(const NGHolder &running_g, const NGHolder &input_dag, - const set &input_start_states, - set *states) { +flat_set execute_graph(const NGHolder &running_g, + const NGHolder &input_dag, + const flat_set &input_start_states, + const flat_set &initial_states) { DEBUG_PRINTF("g has %zu vertices, input_dag has %zu vertices\n", num_vertices(running_g), num_vertices(input_dag)); assert(hasCorrectlyNumberedVertices(running_g)); @@ 
-290,10 +292,8 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag, RevNFAGraph revg(input_dag.g); map > dfs_states; - vector info; - fillInfoTable(running_g, info); - dynamic_bitset<> input_fs(num_vertices(running_g)); - fillStateBitset(running_g, *states, input_fs); + auto info = makeInfoTable(running_g); + auto input_fs = makeStateBitset(running_g, initial_states); for (auto v : input_start_states) { dfs_states[v] = input_fs; @@ -303,21 +303,25 @@ void execute_graph(const NGHolder &running_g, const NGHolder &input_dag, eg_visitor(running_g, info, input_dag, dfs_states), make_assoc_property_map(colours)); - fillVertexSet(dfs_states[input_dag.accept], info, *states); + auto states = getVertices(dfs_states[input_dag.accept], info); #ifdef DEBUG - DEBUG_PRINTF(" output rstates:"); - for (auto v : *states) { - printf(" %u", running_g[v].index); - } - printf("\n"); + DEBUG_PRINTF(" output rstates:"); + for (const auto &v : states) { + printf(" %u", running_g[v].index); + } + printf("\n"); #endif + + return states; } -void execute_graph(const NGHolder &running_g, const NGHolder &input_dag, - set *states) { - set input_start_states = {input_dag.start, input_dag.startDs}; - execute_graph(running_g, input_dag, input_start_states, states); +flat_set execute_graph(const NGHolder &running_g, + const NGHolder &input_dag, + const flat_set &initial_states) { + auto input_start_states = {input_dag.start, input_dag.startDs}; + return execute_graph(running_g, input_dag, input_start_states, + initial_states); } } // namespace ue2 diff --git a/src/nfagraph/ng_execute.h b/src/nfagraph/ng_execute.h index 80fdcbd5..e2c7c72d 100644 --- a/src/nfagraph/ng_execute.h +++ b/src/nfagraph/ng_execute.h @@ -35,8 +35,8 @@ #define NG_EXECUTE_H #include "ng_holder.h" +#include "util/ue2_containers.h" -#include #include namespace ue2 { @@ -44,23 +44,25 @@ namespace ue2 { class CharReach; struct ue2_literal; -void execute_graph(const NGHolder &g, const ue2_literal &input, - 
std::set *states, bool kill_sds = false); +flat_set execute_graph(const NGHolder &g, const ue2_literal &input, + const flat_set &initial, + bool kill_sds = false); -void execute_graph(const NGHolder &g, const std::vector &input, - std::set *states); +flat_set execute_graph(const NGHolder &g, + const std::vector &input, + const flat_set &initial); /** on exit, states contains any state which may still be enabled after * receiving an input which corresponds to some path through the input_dag from * start or startDs to accept. input_dag MUST be acyclic aside from self-loops. */ -void execute_graph(const NGHolder &g, const NGHolder &input_dag, - std::set *states); +flat_set execute_graph(const NGHolder &g, const NGHolder &input_dag, + const flat_set &initial); /* as above, but able to specify the source states for the input graph */ -void execute_graph(const NGHolder &g, const NGHolder &input_dag, - const std::set &input_start_states, - std::set *states); +flat_set execute_graph(const NGHolder &g, const NGHolder &input_dag, + const flat_set &input_start_states, + const flat_set &initial); } // namespace ue2 diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index 90ebb5c3..f26b62aa 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -266,7 +266,7 @@ bool validateEXSL(const NGHolder &g, const vector escapes_vec(1, escapes); const vector notescapes_vec(1, ~escapes); - set states; + ue2::flat_set states; /* turn on all states past the prefix */ DEBUG_PRINTF("region %u is cutover\n", region); for (auto v : vertices_range(g)) { @@ -276,20 +276,20 @@ bool validateEXSL(const NGHolder &g, } /* process the escapes */ - execute_graph(g, escapes_vec, &states); + states = execute_graph(g, escapes_vec, states); /* flood with any number of not escapes */ - set prev_states; + ue2::flat_set prev_states; while (prev_states != states) { prev_states = states; - execute_graph(g, notescapes_vec, &states); + states = execute_graph(g, notescapes_vec, states); 
insert(&states, prev_states); } /* find input starts to use for when we are running the prefix through as * when the escape character arrives we may be in matching the prefix * already */ - set prefix_start_states; + ue2::flat_set prefix_start_states; for (auto v : vertices_range(prefix)) { if (v != prefix.accept && v != prefix.acceptEod /* and as we have already made it past the prefix once */ @@ -298,11 +298,12 @@ bool validateEXSL(const NGHolder &g, } } - execute_graph(prefix, escapes_vec, &prefix_start_states); + prefix_start_states = + execute_graph(prefix, escapes_vec, prefix_start_states); assert(contains(prefix_start_states, prefix.startDs)); /* see what happens after we feed it the prefix */ - execute_graph(g, prefix, prefix_start_states, &states); + states = execute_graph(g, prefix, prefix_start_states, states); for (auto v : states) { assert(v != g.accept && v != g.acceptEod); /* no cr -> should never be diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp index 7f487f89..a0829451 100644 --- a/src/nfagraph/ng_som_util.cpp +++ b/src/nfagraph/ng_som_util.cpp @@ -136,7 +136,7 @@ bool firstMatchIsFirst(const NGHolder &p) { return false; } - set states; + ue2::flat_set states; /* turn on all states (except starts - avoid suffix matches) */ /* If we were doing (1) we would also except states leading to accepts - avoid prefix matches */ @@ -149,7 +149,7 @@ bool firstMatchIsFirst(const NGHolder &p) { } /* run the prefix the main graph */ - execute_graph(p, p, &states); + states = execute_graph(p, p, states); for (auto v : states) { /* need to check if this vertex may represent an infix match - ie @@ -313,7 +313,7 @@ bool sentClearsTail(const NGHolder &g, */ u32 first_bad_region = ~0U; - set states; + ue2::flat_set states; /* turn on all states */ DEBUG_PRINTF("region %u is cutover\n", last_head_region); for (auto v : vertices_range(g)) { @@ -327,7 +327,7 @@ bool sentClearsTail(const NGHolder &g, } /* run the prefix the main graph */ - 
execute_graph(g, sent, &states); + states = execute_graph(g, sent, states); /* .. and check if we are left with anything in the tail region */ for (auto v : states) { diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 34e76269..a2bd971e 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1631,20 +1631,23 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left, assert(left.graph()); const NGHolder &h = *left.graph(); + ue2::flat_set all_states; + insert(&all_states, vertices(h)); + assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */ + DEBUG_PRINTF("removing sds\n"); + all_states.erase(h.startDs); + + ue2::flat_set states; + /* check each pred literal to see if they all kill previous graph * state */ for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) { const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id); const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s); - set states; - insert(&states, vertices(h)); - assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */ - DEBUG_PRINTF("removing sds\n"); - states.erase(h.startDs); DEBUG_PRINTF("running graph %zu\n", states.size()); - execute_graph(h, s, &states, true); - DEBUG_PRINTF("ran\n"); + states = execute_graph(h, s, all_states, true); + DEBUG_PRINTF("ran, %zu states on\n", states.size()); if (!states.empty()) { return false; From 313822c15741487de28c29f05e25dbd368f95424 Mon Sep 17 00:00:00 2001 From: Mohammad Abdul Awal Date: Tue, 17 Nov 2015 17:50:23 +0000 Subject: [PATCH 37/54] FDR runtime simplification Removed static specialisation of domains. 
--- src/fdr/autogen.py | 11 +-- src/fdr/fdr.c | 21 ----- src/fdr/fdr_autogen.py | 84 +++++++++---------- src/fdr/fdr_compile.cpp | 12 +++ src/fdr/fdr_dump.cpp | 1 + src/fdr/fdr_engine_description.cpp | 127 +++++++++++++++-------------- src/fdr/fdr_engine_description.h | 1 - src/fdr/fdr_internal.h | 6 +- 8 files changed, 124 insertions(+), 139 deletions(-) diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py index 36e4c16c..e5b4f39e 100755 --- a/src/fdr/autogen.py +++ b/src/fdr/autogen.py @@ -54,16 +54,11 @@ def produce_fdr_compiles(l): def build_fdr_matchers(): all_matchers = [ ] - domains = [8, 10, 11, 12, 13] - big_domains = [ 14, 15 ] + strides = [ 1, 2, 4 ] common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 } - for d in domains: - all_matchers += [ M3(stride = 1, domain = d, **common) ] - all_matchers += [ M3(stride = 2, domain = d, **common) ] - all_matchers += [ M3(stride = 4, domain = d, **common) ] - for d in big_domains: - all_matchers += [ M3(stride = 1, domain = d, **common) ] + for s in strides: + all_matchers += [ M3(stride = s, **common) ] return all_matchers diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 082800f1..f83a4265 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -40,27 +40,6 @@ #include "fdr_confirm_runtime.h" #include "fdr_streaming_runtime.h" #include "fdr_loadval.h" - -static really_inline UNUSED -u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) { - u32 r = 0; - if (a->start_offset == 0) { - if (numBits <= 8) { - r = a->buf_history[a->len_history - 1]; - } else { - r = a->buf_history[a->len_history - 1]; - r |= (a->buf[0] << 8); - } - } else { - if (numBits <= 8) { - r = a->buf[a->start_offset - 1]; - } else { - r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len); - } - } - return r & ((1 << numBits) - 1); -} - #include "fdr_autogen.c" #define FAKE_HISTORY_SIZE 16 diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py index 685cca3b..748d811f 100755 --- 
a/src/fdr/fdr_autogen.py +++ b/src/fdr/fdr_autogen.py @@ -74,12 +74,12 @@ class ValueExtractStep(Step): dsb = m.datasize_bytes modval = offset % dsb - if m.domain > 8 and modval == dsb - 1: + if modval == dsb - 1: # Case 1: reading more than one byte over the end of the bulk load self.latency = 4 if sub_load_cautious: - code_string = "cautious_forward" + code_string = "cautious_forward" else: code_string = "normal" load_string = m.single_load_type.load_expr_data(self.offset, code_string) @@ -101,7 +101,7 @@ class ValueExtractStep(Step): temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust) - init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask) + init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust) v_var = self.nv(m.value_extract_type, "v%d" % offset) self.val = v_var.gen_initializer_stmt(init_string) @@ -173,14 +173,10 @@ class ConfirmStep(Step): enable_confirmless = m.stride == 1, do_bailout = False) class M3(MatcherBase): - def get_hash_safety_parameters(self): - h_size = self.single_load_type.size_in_bytes() - return (0, h_size - 1) - def produce_compile_call(self): - print " { %d, %d, %d, %d, %d, %s, %d, %d }," % ( + print " { %d, %d, %d, %d, %s, %d, %d }," % ( self.id, self.state_width, self.num_buckets, - self.stride, self.domain, + self.stride, self.arch.target, self.conf_pull_back, self.conf_top_level_split) def produce_main_loop(self, switch_variant = False): @@ -192,8 +188,8 @@ class M3(MatcherBase): ctxt = CodeGenContext(self) if switch_variant: - print " ptr -= (iterBytes - dist);" - print " { " # need an extra scope around switch variant to stop its globals escaping + print " ptr -= (iterBytes - dist);" + print " { " # need an extra scope around switch variant to stop its globals escaping else: print " if (doMainLoop) {" print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {" @@ -349,25 +345,30 @@ class M3(MatcherBase): shift_expr = "%s = %s" % (state.name, 
state.type.shift_expr(state.name, shift_distance)) s = Template(""" - $TYPENAME s; - if (a->len_history) { - u32 tmp = getPreStartVal(a, $DOMAIN); - s = *((const $TYPENAME *)ft + tmp); - $SHIFT_EXPR; - } else { - s = *(const $TYPENAME *)&fdr->start; - } + $TYPENAME s; + if (a->len_history) { + u32 tmp = 0; + if (a->start_offset == 0) { + tmp = a->buf_history[a->len_history - 1]; + tmp |= (a->buf[0] << 8); + } else { + tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len); + } + tmp &= fdr->domainMask; + s = *((const $TYPENAME *)ft + tmp); + $SHIFT_EXPR; + } else { + s = *(const $TYPENAME *)&fdr->start; + } """).substitute(TYPENAME = s_type.get_name(), ZERO_EXPR = s_type.zero_expression(), - DOMAIN = self.domain, SHIFT_EXPR = shift_expr) return s def produce_code(self): - (behind, ahead) = self.get_hash_safety_parameters() - loop_read_behind = behind - loop_read_ahead = self.loop_bytes + ahead + loop_read_behind = 0 + loop_read_ahead = self.loop_bytes + 1 # we set up mask and shift stuff for extracting our masks from registers # @@ -380,7 +381,7 @@ class M3(MatcherBase): ssb = self.state_type.size / 8 # state size in bytes # Intel path - if ssb == 16 and self.domain == 16: + if ssb == 16: # obscure corner - we don't have the room in the register to # do this for all values so we don't. 
domain==16 is pretty # bad anyhow, of course @@ -390,7 +391,6 @@ class M3(MatcherBase): shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 } self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ] - self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust print self.produce_header(visible = False) @@ -398,21 +398,19 @@ class M3(MatcherBase): print " Arch: " + self.arch.name, print " State type: " + self.state_type.get_name(), print " Num buckets: %d" % self.num_buckets, - print " Domain: %d" % self.domain, print " Stride: %d" % self.stride print self.produce_common_declarations() - print - print "\tconst size_t tabSize = %d;" % self.table_size - print """ - const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)); - const u32 * confBase = (const u32 *)(ft + tabSize); -""" + print " assert(fdr->domain > 8 && fdr->domain < 16);" + print + print " u64a domain_mask = fdr->domainMask;" + print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));" + print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);" print self.produce_init_state() - print "\tconst size_t iterBytes = %d;" % self.loop_bytes - print "\tconst size_t START_MOD = %d;" % self.datasize_bytes - print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead + print " const size_t iterBytes = %d;" % self.loop_bytes + print " const size_t START_MOD = %d;" % self.datasize_bytes + print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead print """ while (ptr < buf + len) { @@ -451,9 +449,9 @@ class M3(MatcherBase): print self.produce_footer() def get_name(self): - return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width) + return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width) - def __init__(self, state_width, domain, stride, + def __init__(self, state_width, stride, arch, table_state_width = None, num_buckets = 8, @@ -474,17 +472,9 @@ class M3(MatcherBase): self.table_state_width = state_width 
self.table_state_type = getRequiredType(self.table_state_width) - # domain is the number of bits that we draw from our input to - # index our 'reach' table - if not 8 <= domain <= 16: - fail_out("Unsupported domain: %d" % domain) - self.domain = domain - # this is the load type required for this domain if we want to + # this is the load type required for domain [9:15] if we want to # load it one at a time - self.single_load_type = getRequiredType(self.domain) - - # table size - self.table_size = 2**domain * table_state_width // 8 + self.single_load_type = IntegerType(16) # stride is the frequency with which we make data-driven # accesses to our reach table diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 8be44370..ccf17626 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -184,6 +184,13 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { ptr += floodControlTmp.second; aligned_free(floodControlTmp.first); + /* we are allowing domains 9 to 15 only */ + assert(eng.bits > 8 && eng.bits < 16); + fdr->domain = eng.bits; + fdr->schemeWidthByte = eng.schemeWidth / 8; + fdr->domainMask = (1 << eng.bits) - 1; + fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte; + if (link.first) { fdr->link = verify_u32(ptr - fdr_base); memcpy(ptr, link.first, link.second); @@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector &lits, bool make_small, return nullptr; } + // temporary hack for unit testing + if (hint != HINT_INVALID) { + des->bits = 9; + } + FDRCompiler fc(lits, *des, make_small); return fc.build(link); } diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index ae246270..158170c2 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) { unique_ptr des = getFdrDescription(fdr->engineID); if (des) { + fprintf(f, " domain %u\n", des->bits); fprintf(f, " stride %u\n", des->stride); fprintf(f, " buckets %u\n", des->getNumBuckets()); fprintf(f, " width %u\n", 
des->schemeWidth); diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 2a6fda79..5d470c7e 100644 --- a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def) : EngineDescription(def.id, targetByArchFeatures(def.cpu_features), def.numBuckets, def.confirmPullBackDistance, def.confirmTopLevelSplit), - schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {} + schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {} u32 FDREngineDescription::getDefaultFloodSuffixLength() const { // rounding up, so that scheme width 32 and 6 buckets is 6 not 5! @@ -105,76 +105,83 @@ unique_ptr chooseEngine(const target_t &target, DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl, desiredStride); - const FDREngineDescription *best = nullptr; + FDREngineDescription *best = nullptr; u32 best_score = 0; - for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { - const FDREngineDescription &eng = allDescs[engineID]; - if (!eng.isValidOnTarget(target)) { - continue; - } - if (msl < eng.stride) { - continue; - } - - u32 score = 100; - - score -= absdiff(desiredStride, eng.stride); - - if (eng.stride <= desiredStride) { - score += eng.stride; - } - - u32 effLits = vl.size(); /* * desiredStride;*/ - u32 ideal; - if (effLits < eng.getNumBuckets()) { - if (eng.stride == 1) { - ideal = 8; - } else { - ideal = 10; + for (u32 domain = 9; domain <= 15; domain++) { + for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { + // to make sure that domains >=14 have stride 1 according to origin + if (domain > 13 && engineID > 0) { + continue; + } + FDREngineDescription &eng = allDescs[engineID]; + if (!eng.isValidOnTarget(target)) { + continue; + } + if (msl < eng.stride) { + continue; } - } else if (effLits < 20) { - ideal = 10; - } else if (effLits < 100) { - ideal = 11; - } else if (effLits < 1000) { - 
ideal = 12; - } else if (effLits < 10000) { - ideal = 13; - } else { - ideal = 15; - } - if (ideal != 8 && eng.schemeWidth == 32) { - ideal += 1; - } + u32 score = 100; - if (make_small) { - ideal -= 2; - } + score -= absdiff(desiredStride, eng.stride); - if (eng.stride > 1) { - ideal++; - } + if (eng.stride <= desiredStride) { + score += eng.stride; + } - DEBUG_PRINTF("effLits %u\n", effLits); + u32 effLits = vl.size(); /* * desiredStride;*/ + u32 ideal; + if (effLits < eng.getNumBuckets()) { + if (eng.stride == 1) { + ideal = 8; + } else { + ideal = 10; + } + } else if (effLits < 20) { + ideal = 10; + } else if (effLits < 100) { + ideal = 11; + } else if (effLits < 1000) { + ideal = 12; + } else if (effLits < 10000) { + ideal = 13; + } else { + ideal = 15; + } - if (target.is_atom_class() && !make_small && effLits < 4000) { - /* Unless it is a very heavy case, we want to build smaller tables - * on lightweight machines due to their small caches. */ - ideal -= 2; - } + if (ideal != 8 && eng.schemeWidth == 32) { + ideal += 1; + } - score -= absdiff(ideal, eng.bits); + if (make_small) { + ideal -= 2; + } - DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " - "-> score=%u\n", - eng.getID(), eng.schemeWidth, eng.bits, - eng.getNumBuckets(), eng.stride, score); + if (eng.stride > 1) { + ideal++; + } - if (!best || score > best_score) { - best = &eng; - best_score = score; + DEBUG_PRINTF("effLits %u\n", effLits); + + if (target.is_atom_class() && !make_small && effLits < 4000) { + /* Unless it is a very heavy case, we want to build smaller tables + * on lightweight machines due to their small caches. 
*/ + ideal -= 2; + } + + score -= absdiff(ideal, domain); + + DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " + "-> score=%u\n", + eng.getID(), eng.schemeWidth, eng.bits, + eng.getNumBuckets(), eng.stride, score); + + if (!best || score > best_score) { + eng.bits = domain; + best = &eng; + best_score = score; + } } } diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h index d936095b..45f64ac0 100644 --- a/src/fdr/fdr_engine_description.h +++ b/src/fdr/fdr_engine_description.h @@ -43,7 +43,6 @@ struct FDREngineDef { u32 schemeWidth; u32 numBuckets; u32 stride; - u32 bits; u64a cpu_features; u32 confirmPullBackDistance; u32 confirmTopLevelSplit; diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index 6c722777..607e039c 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -76,9 +76,11 @@ struct FDR { * structures (spillover strings and hash table) if we're a secondary * structure. */ u32 link; + u8 domain; /* dynamic domain info */ + u8 schemeWidthByte; /* scheme width in bytes */ + u16 domainMask; /* pre-computed domain mask */ + u32 tabSize; /* pre-computed hashtable size in bytes */ u32 pad1; - u32 pad2; - u32 pad3; union { u32 s_u32; From bdb7a100344d5e500081d061ea42f24e532e1ba4 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 25 Nov 2015 14:53:27 +1100 Subject: [PATCH 40/54] Fix defn of POSIX graph, print, punct classes The POSIX classes [:graph:], [:print:] and [:punct:] are handled specially in UCP mode by PCRE. This change matches that behaviour. 
--- src/parser/ComponentClass.cpp | 7 ++++++- src/parser/ComponentClass.h | 4 +++- src/parser/Utf8ComponentClass.cpp | 24 +++++++++++++++++++++++- src/parser/Utf8ComponentClass.h | 3 +++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/parser/ComponentClass.cpp b/src/parser/ComponentClass.cpp index 43c05898..a91ae979 100644 --- a/src/parser/ComponentClass.cpp +++ b/src/parser/ComponentClass.cpp @@ -81,8 +81,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) { case CLASS_DIGIT: return number; case CLASS_GRAPH: - case CLASS_XGRAPH: return CharReach(0x21, 0x7e); + case CLASS_XGRAPH: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_HORZ: return CharReach("\x09\x20\xA0"); case CLASS_LOWER: @@ -93,11 +94,15 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) { } case CLASS_PRINT: return CharReach(0x20, 0x7e); + case CLASS_XPRINT: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_PUNCT: return CharReach(0x21, '0' - 1) | CharReach('9' + 1, 'A' - 1) | CharReach('Z' + 1, 'a' - 1) | CharReach('z' + 1, 126); + case CLASS_XPUNCT: + return to_cr(getPredefinedCodePointSet(c, mode)); case CLASS_SPACE: return CharReach("\x09\x0a\x0c\x0b\x0d\x20"); case CLASS_UPPER: diff --git a/src/parser/ComponentClass.h b/src/parser/ComponentClass.h index 1cb1a7d0..040e6d78 100644 --- a/src/parser/ComponentClass.h +++ b/src/parser/ComponentClass.h @@ -63,7 +63,9 @@ enum PredefinedClass { CLASS_VERT, CLASS_WORD, CLASS_XDIGIT, - CLASS_XGRAPH, + CLASS_XGRAPH, /* [:graph:] in UCP mode */ + CLASS_XPRINT, /* [:print:] in UCP mode */ + CLASS_XPUNCT, /* [:punct:] in UCP mode */ CLASS_UCP_C, CLASS_UCP_CC, CLASS_UCP_CF, diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index 3a6a85a4..54f9edb9 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -75,6 +75,10 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) 
{ } else { return CLASS_UCP_LL; } + case CLASS_PRINT: + return CLASS_XPRINT; + case CLASS_PUNCT: + return CLASS_XPUNCT; case CLASS_SPACE: return CLASS_UCP_XPS; case CLASS_UPPER: @@ -90,7 +94,6 @@ PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) { } } -static CodePointSet getPredefinedCodePointSet(PredefinedClass c, const ParseMode &mode) { /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */ @@ -117,6 +120,25 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c, rv |= cf; return rv; } + case CLASS_XPRINT: { + // Same as graph, plus everything with the Zs property. + CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode); + rv |= getUcpZs(); + return rv; + } + case CLASS_XPUNCT: { + // Everything with the P (punctuation) property, plus code points in S + // (symbols) that are < 128. + // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the + // cut-off here, so we are compatible with that for now. PCRE bug #1718 + // tracks this; once PCRE 8.38 is released we should correct this + // behaviour. 
+ CodePointSet rv = getUcpP(); + CodePointSet symbols = getUcpS(); + symbols.unsetRange(256, MAX_UNICODE); + rv |= symbols; + return rv; + } case CLASS_HORZ: { CodePointSet rv; rv.set(0x0009); /* Horizontal tab */ diff --git a/src/parser/Utf8ComponentClass.h b/src/parser/Utf8ComponentClass.h index b2c402f9..3d21a278 100644 --- a/src/parser/Utf8ComponentClass.h +++ b/src/parser/Utf8ComponentClass.h @@ -110,6 +110,9 @@ private: PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode); bool isUcp(PredefinedClass c); +CodePointSet getPredefinedCodePointSet(PredefinedClass c, + const ParseMode &mode); + } // namespace #endif // UTF8_COMPONENT_CLASS_H From 25a01e1c3cb31b6bde6c83b82ad41a87fd0f0deb Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 17 Nov 2015 17:23:52 +1100 Subject: [PATCH 39/54] Unify handling of caseless flag in class parser Apply caselessness to each element added to a class, rather than all at finalize time (which required separated ucp dnf and-ucp working data). Unifies the behaviour of AsciiComponentClass and Utf8ComponentClass in this respect. 
--- src/parser/AsciiComponentClass.cpp | 32 +++++++++---------- src/parser/AsciiComponentClass.h | 4 +-- src/parser/Utf8ComponentClass.cpp | 51 ++++++------------------------ src/parser/Utf8ComponentClass.h | 2 -- 4 files changed, 27 insertions(+), 62 deletions(-) diff --git a/src/parser/AsciiComponentClass.cpp b/src/parser/AsciiComponentClass.cpp index 44ecb5bb..7cfa6e11 100644 --- a/src/parser/AsciiComponentClass.cpp +++ b/src/parser/AsciiComponentClass.cpp @@ -61,11 +61,15 @@ void AsciiComponentClass::createRange(unichar to) { unsigned char from = (u8)range_start; if (from > to) { throw LocatedParseError("Range out of order in character class"); - } else { - in_cand_range = false; - cr.setRange(from, to); - range_start = INVALID_UNICODE; } + + in_cand_range = false; + CharReach ncr(from, to); + if (mode.caseless) { + make_caseless(&ncr); + } + cr |= ncr; + range_start = INVALID_UNICODE; } void AsciiComponentClass::notePositions(GlushkovBuildState &bs) { @@ -95,16 +99,13 @@ void AsciiComponentClass::add(PredefinedClass c, bool negative) { c = translateForUcpMode(c, mode); } + // Note: caselessness is handled by getPredefinedCharReach. 
CharReach pcr = getPredefinedCharReach(c, mode); if (negative) { pcr.flip(); } - if (isUcp(c)) { - cr_ucp |= pcr; - } else { - cr |= pcr; - } + cr |= pcr; range_start = INVALID_UNICODE; in_cand_range = false; } @@ -120,7 +121,12 @@ void AsciiComponentClass::add(unichar c) { return; } - cr.set(c); + CharReach ncr(c, c); + if (mode.caseless) { + make_caseless(&ncr); + } + + cr |= ncr; range_start = c; } @@ -136,12 +142,6 @@ void AsciiComponentClass::finalize() { in_cand_range = false; } - if (mode.caseless) { - make_caseless(&cr); - } - - cr |= cr_ucp; /* characters from ucp props don't participate in caseless */ - if (m_negate) { cr.flip(); } diff --git a/src/parser/AsciiComponentClass.h b/src/parser/AsciiComponentClass.h index 2d5ef843..925fa9bf 100644 --- a/src/parser/AsciiComponentClass.h +++ b/src/parser/AsciiComponentClass.h @@ -78,12 +78,10 @@ protected: private: Position position; CharReach cr; - CharReach cr_ucp; // Private copy ctor. Use clone instead. AsciiComponentClass(const AsciiComponentClass &other) - : ComponentClass(other), position(other.position), cr(other.cr), - cr_ucp(other.cr_ucp) {} + : ComponentClass(other), position(other.position), cr(other.cr) {} }; } // namespace ue2 diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index 54f9edb9..21707902 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -515,16 +515,16 @@ void UTF8ComponentClass::createRange(unichar to) { unichar from = range_start; if (from > to) { throw LocatedParseError("Range out of order in character class"); - } else { - in_cand_range = false; - CodePointSet ncps; - ncps.setRange(from, to); - if (mode.caseless) { - make_caseless(&ncps); - } - cps |= ncps; - range_start = INVALID_UNICODE; } + + in_cand_range = false; + CodePointSet ncps; + ncps.setRange(from, to); + if (mode.caseless) { + make_caseless(&ncps); + } + cps |= ncps; + range_start = INVALID_UNICODE; } void UTF8ComponentClass::add(PredefinedClass 
c, bool negative) { @@ -543,11 +543,7 @@ void UTF8ComponentClass::add(PredefinedClass c, bool negative) { pcps.flip(); } - if (isUcp(c)) { - cps_ucp |= pcps; - } else { - cps |= pcps; - } + cps |= pcps; range_start = INVALID_UNICODE; in_cand_range = false; @@ -585,8 +581,6 @@ void UTF8ComponentClass::finalize() { in_cand_range = false; } - cps |= cps_ucp; /* characters from ucp props always case sensitive */ - if (m_negate) { cps.flip(); } @@ -594,31 +588,6 @@ void UTF8ComponentClass::finalize() { finalized = true; } -bool isUcp(PredefinedClass c) { - switch (c) { - case CLASS_ALNUM: - case CLASS_ALPHA: - case CLASS_ANY: - case CLASS_ASCII: - case CLASS_BLANK: - case CLASS_CNTRL: - case CLASS_DIGIT: - case CLASS_GRAPH: - case CLASS_HORZ: - case CLASS_LOWER: - case CLASS_PRINT: - case CLASS_PUNCT: - case CLASS_SPACE: - case CLASS_UPPER: - case CLASS_VERT: - case CLASS_WORD: - case CLASS_XDIGIT: - return false; - default: - return true; - } -} - Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) { map::const_iterator it = heads.find(first_byte); if (it != heads.end()) { diff --git a/src/parser/Utf8ComponentClass.h b/src/parser/Utf8ComponentClass.h index 3d21a278..f4e7ea32 100644 --- a/src/parser/Utf8ComponentClass.h +++ b/src/parser/Utf8ComponentClass.h @@ -93,7 +93,6 @@ private: void buildFourByte(GlushkovBuildState &bs); CodePointSet cps; - CodePointSet cps_ucp; std::map heads; Position single_pos; @@ -108,7 +107,6 @@ private: }; PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode); -bool isUcp(PredefinedClass c); CodePointSet getPredefinedCodePointSet(PredefinedClass c, const ParseMode &mode); From f9b7e806b1ae0067a45f952fb68b77196dc974b0 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 25 Nov 2015 14:53:27 +1100 Subject: [PATCH 40/54] Update defn of class [:punct:] for PCRE 8.38 --- src/parser/Utf8ComponentClass.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git 
a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index 21707902..b6b161e8 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -129,13 +129,9 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c, case CLASS_XPUNCT: { // Everything with the P (punctuation) property, plus code points in S // (symbols) that are < 128. - // NOTE: PCRE versions 8.37 and earlier erroneously use 256 as the - // cut-off here, so we are compatible with that for now. PCRE bug #1718 - // tracks this; once PCRE 8.38 is released we should correct this - // behaviour. CodePointSet rv = getUcpP(); CodePointSet symbols = getUcpS(); - symbols.unsetRange(256, MAX_UNICODE); + symbols.unsetRange(128, MAX_UNICODE); rv |= symbols; return rv; } From 205bc1af7f4111b7757bc2646528df34fb5e54bc Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 25 Nov 2015 17:05:36 +1100 Subject: [PATCH 41/54] PCRE includes U+180E in /[:print:]/8W --- src/parser/Utf8ComponentClass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index b6b161e8..cdfc974a 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -124,6 +124,7 @@ CodePointSet getPredefinedCodePointSet(PredefinedClass c, // Same as graph, plus everything with the Zs property. CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode); rv |= getUcpZs(); + rv.set(0x180e); // Also included in this class by PCRE 8.38. return rv; } case CLASS_XPUNCT: { From 15c2980948be285b9051b609358d37e6acc397fb Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 27 Nov 2015 13:30:59 +1100 Subject: [PATCH 42/54] Make key 64 bits where large shifts may be used. This fixes a long-standing issue with large multibit structures. 
--- src/util/multibit.c | 18 +++++----- src/util/multibit.h | 35 ++++++++++--------- unit/internal/multi_bit.cpp | 69 ++++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 56 deletions(-) diff --git a/src/util/multibit.c b/src/util/multibit.c index ca5e5656..c22b73ff 100644 --- a/src/util/multibit.c +++ b/src/util/multibit.c @@ -142,23 +142,25 @@ const u32 mmbit_root_offset_from_level[7] = { u32 mmbit_size(u32 total_bits) { MDEBUG_PRINTF("%u\n", total_bits); - // UE-2228: multibit has bugs in very, very large cases that we should be - // protected against at compile time by resource limits. - assert(total_bits <= 1U << 30); - // Flat model multibit structures are just stored as a bit vector. if (total_bits <= MMB_FLAT_MAX_BITS) { return ROUNDUP_N(total_bits, 8) / 8; } - u32 current_level = 1; - u32 total = 0; + u64a current_level = 1; // Number of blocks on current level. + u64a total = 0; // Total number of blocks. while (current_level * MMB_KEY_BITS < total_bits) { total += current_level; current_level <<= MMB_KEY_SHIFT; } - total += (total_bits + MMB_KEY_BITS - 1)/MMB_KEY_BITS; - return sizeof(MMB_TYPE) * total; + + // Last level is a one-for-one bit vector. It needs room for total_bits + // elements, rounded up to the nearest block. 
+ u64a last_level = ((u64a)total_bits + MMB_KEY_BITS - 1) / MMB_KEY_BITS; + total += last_level; + + assert(total * sizeof(MMB_TYPE) <= UINT32_MAX); + return (u32)(total * sizeof(MMB_TYPE)); } #ifdef DUMP_SUPPORT diff --git a/src/util/multibit.h b/src/util/multibit.h index 251316b3..771c158d 100644 --- a/src/util/multibit.h +++ b/src/util/multibit.h @@ -235,18 +235,18 @@ const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) { /** \brief get the block for this key on the current level as a u8 ptr */ static really_inline u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { - return mmbit_get_level_root(bits, level) + - (key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) * - sizeof(MMB_TYPE); + u8 *level_root = mmbit_get_level_root(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); } /** \brief get the block for this key on the current level as a const u8 ptr */ static really_inline const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level, u32 key) { - return mmbit_get_level_root_const(bits, level) + - (key >> (mmbit_get_ks(max_level, level) + MMB_KEY_SHIFT)) * - sizeof(MMB_TYPE); + const u8 *level_root = mmbit_get_level_root_const(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); } /** \brief get the _byte_ for this key on the current level as a u8 ptr */ @@ -254,7 +254,7 @@ static really_inline u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { u8 *level_root = mmbit_get_level_root(bits, level); u32 ks = mmbit_get_ks(max_level, level); - return level_root + (key >> (ks + MMB_KEY_SHIFT - 3)); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3)); } /** \brief get our key value for the current level */ @@ -721,11 +721,11 @@ u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin, } static really_inline 
-MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max, - u32 block_base) { +MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max, + u64a block_base) { const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT; - u32 lshift = (block_min - block_base) >> level_shift; - u32 ushift = (block_max - block_base) >> level_shift; + u64a lshift = (block_min - block_base) >> level_shift; + u64a ushift = (block_max - block_base) >> level_shift; MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0; MMB_TYPE umask = ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES; @@ -734,7 +734,7 @@ MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u32 block_min, u32 block_max, static really_inline u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) { - u32 key = 0; + u64a key = 0; u32 ks = mmbit_keyshift(total_bits); const u32 max_level = mmbit_maxlevel_from_keyshift(ks); u32 level = 0; @@ -743,9 +743,9 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 assert(level <= max_level); u32 block_width = MMB_KEY_BITS << ks; - u32 block_base = key*block_width; - u32 block_min = MAX(it_start, block_base); - u32 block_max = MIN(it_end, block_base + block_width - 1); + u64a block_base = key * block_width; + u64a block_min = MAX(it_start, block_base); + u64a block_max = MIN(it_end, block_base + block_width - 1); const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE); MMB_TYPE block = mmb_load(block_ptr); @@ -761,13 +761,14 @@ u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 // No bit found, go up a level // we know that this block didn't have any answers, so we can push // our start iterator forward. 
- it_start = block_base + block_width; - if (it_start > it_end) { + u64a next_start = block_base + block_width; + if (next_start > it_end) { break; } if (level-- == 0) { break; } + it_start = next_start; key >>= MMB_KEY_SHIFT; ks += MMB_KEY_SHIFT; } diff --git a/unit/internal/multi_bit.cpp b/unit/internal/multi_bit.cpp index 27725219..3f5c5908 100644 --- a/unit/internal/multi_bit.cpp +++ b/unit/internal/multi_bit.cpp @@ -363,7 +363,9 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) { ASSERT_TRUE(ba != nullptr); // Set one bit on and run some checks. - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { + SCOPED_TRACE(i); + mmbit_clear(ba, test_size); mmbit_set(ba, test_size, i); @@ -381,7 +383,12 @@ TEST_P(MultiBitTest, BoundedIteratorSingle) { // Scanning from one past our bit to the end should find nothing. if (i != test_size - 1) { - ASSERT_EQ(MMB_INVALID, mmbit_iterate_bounded(ba, test_size, i + 1, test_size)); + // Ordinary iterator. + ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, i)); + + // Bounded iterator. + ASSERT_EQ(MMB_INVALID, + mmbit_iterate_bounded(ba, test_size, i + 1, test_size)); } } } @@ -393,7 +400,7 @@ TEST_P(MultiBitTest, BoundedIteratorAll) { // Switch everything on. fill_mmbit(ba, test_size); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { if (i != 0) { ASSERT_EQ(0U, mmbit_iterate_bounded(ba, test_size, 0, i)); } @@ -408,13 +415,13 @@ TEST_P(MultiBitTest, BoundedIteratorEven) { // Set every even-numbered bit and see what we can see. mmbit_clear(ba, test_size); - for (u32 i = 0; i < test_size; i += 2) { + for (u64a i = 0; i < test_size; i += 2) { mmbit_set(ba, test_size, i); } u32 even_stride = stride % 2 ? stride + 1 : stride; - for (u32 i = 0; i < test_size; i += even_stride) { + for (u64a i = 0; i < test_size; i += even_stride) { // Scanning from each even bit to the end should find itself. 
ASSERT_EQ(i, mmbit_iterate_bounded(ba, test_size, i, test_size)); @@ -439,13 +446,13 @@ TEST_P(MultiBitTest, BoundedIteratorOdd) { // Set every odd-numbered bit and see what we can see. mmbit_clear(ba, test_size); - for (u32 i = 1; i < test_size; i += 2) { + for (u64a i = 1; i < test_size; i += 2) { mmbit_set(ba, test_size, i); } u32 even_stride = stride % 2 ? stride + 1 : stride; - for (u32 i = 0; i < test_size; i += even_stride) { + for (u64a i = 0; i < test_size; i += even_stride) { // Scanning from each even bit to the end should find i+1. if (i+1 < test_size) { ASSERT_EQ(i+1, mmbit_iterate_bounded(ba, test_size, i, test_size)); @@ -473,7 +480,7 @@ TEST_P(MultiBitTest, Set) { mmbit_clear(ba, test_size); ASSERT_FALSE(mmbit_any(ba, test_size)); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); // set a bit that wasn't set before @@ -500,7 +507,7 @@ TEST_P(MultiBitTest, Iter) { mmbit_clear(ba, test_size); ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID)); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); mmbit_clear(ba, test_size); mmbit_set(ba, test_size, i); @@ -517,13 +524,13 @@ TEST_P(MultiBitTest, IterAll) { ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, MMB_INVALID)); // Set all bits. - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { mmbit_set(ba, test_size, i); } // Find all bits. 
u32 it = MMB_INVALID; - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { ASSERT_EQ(i, mmbit_iterate(ba, test_size, it)); it = i; } @@ -536,7 +543,7 @@ TEST_P(MultiBitTest, AnyPrecise) { mmbit_clear(ba, test_size); ASSERT_FALSE(mmbit_any_precise(ba, test_size)); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); mmbit_clear(ba, test_size); mmbit_set(ba, test_size, i); @@ -551,7 +558,7 @@ TEST_P(MultiBitTest, Any) { mmbit_clear(ba, test_size); ASSERT_FALSE(mmbit_any(ba, test_size)); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); mmbit_clear(ba, test_size); mmbit_set(ba, test_size, i); @@ -567,7 +574,7 @@ TEST_P(MultiBitTest, UnsetRange1) { fill_mmbit(ba, test_size); // Use mmbit_unset_range to switch off any single bit. - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); ASSERT_TRUE(mmbit_isset(ba, test_size, i)); mmbit_unset_range(ba, test_size, i, i + 1); @@ -590,7 +597,7 @@ TEST_P(MultiBitTest, UnsetRange2) { // Use mmbit_unset_range to switch off all bits. mmbit_unset_range(ba, test_size, 0, test_size); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); ASSERT_FALSE(mmbit_isset(ba, test_size, i)); } @@ -601,12 +608,12 @@ TEST_P(MultiBitTest, UnsetRange3) { ASSERT_TRUE(ba != nullptr); // Use mmbit_unset_range to switch off bits in chunks of 3. - for (u32 i = 0; i < test_size - 3; i += stride) { + for (u64a i = 0; i < test_size - 3; i += stride) { // Switch on the bit before, the bits in question, and the bit after. 
if (i > 0) { mmbit_set(ba, test_size, i - 1); } - for (u32 j = i; j < min(i + 4, test_size); j++) { + for (u64a j = i; j < min(i + 4, (u64a)test_size); j++) { mmbit_set(ba, test_size, j); } @@ -635,7 +642,7 @@ TEST_P(MultiBitTest, InitRangeAll) { mmbit_init_range(ba, test_size, 0, test_size); // Make sure they're all set. - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { SCOPED_TRACE(i); ASSERT_TRUE(mmbit_isset(ba, test_size, i)); } @@ -656,7 +663,7 @@ TEST_P(MultiBitTest, InitRangeOne) { SCOPED_TRACE(test_size); ASSERT_TRUE(ba != nullptr); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { mmbit_init_range(ba, test_size, i, i + 1); // Only bit 'i' should be on. @@ -685,7 +692,7 @@ TEST_P(MultiBitTest, InitRangeChunked) { ASSERT_EQ(chunk_begin, mmbit_iterate(ba, test_size, MMB_INVALID)); // All bits in the chunk should be on. - for (u32 i = chunk_begin; i < chunk_end; i += stride) { + for (u64a i = chunk_begin; i < chunk_end; i += stride) { SCOPED_TRACE(i); ASSERT_TRUE(mmbit_isset(ba, test_size, i)); } @@ -985,7 +992,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginAll) { vector it; vector bits; bits.reserve(test_size / stride); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } mmbBuildSparseIterator(it, bits, test_size); @@ -1032,7 +1039,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) { // Switch every third bits on in state mmbit_clear(ba, test_size); ASSERT_FALSE(mmbit_any(ba, test_size)); - for (u32 i = 0; i < test_size; i += 3) { + for (u64a i = 0; i < test_size; i += 3) { mmbit_set(ba, test_size, i); } @@ -1044,7 +1051,7 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) { ASSERT_EQ(0U, val); ASSERT_EQ(0U, idx); - for (u32 i = 0; i < test_size - 3; i += 3) { + for (u64a i = 0; i < test_size - 3; i += 3) { mmbit_unset(ba, test_size, i); val = mmbit_sparse_iter_begin(ba, test_size, &idx, &it[0], 
&state[0]); ASSERT_EQ(i+3, val); @@ -1060,7 +1067,7 @@ TEST_P(MultiBitTest, SparseIteratorNextAll) { vector it; vector bits; bits.reserve(test_size / stride); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } mmbBuildSparseIterator(it, bits, test_size); @@ -1103,7 +1110,7 @@ TEST_P(MultiBitTest, SparseIteratorNextExactStrided) { vector it; vector bits; bits.reserve(test_size / stride); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); mmbit_set(ba, test_size, i); } @@ -1135,7 +1142,7 @@ TEST_P(MultiBitTest, SparseIteratorNextNone) { vector it; vector bits; bits.reserve(test_size / stride); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } mmbBuildSparseIterator(it, bits, test_size); @@ -1164,7 +1171,7 @@ TEST_P(MultiBitTest, SparseIteratorUnsetAll) { vector it; vector bits; bits.reserve(test_size / stride); - for (u32 i = 0; i < test_size; i += stride) { + for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } mmbBuildSparseIterator(it, bits, test_size); @@ -1194,10 +1201,10 @@ TEST_P(MultiBitTest, SparseIteratorUnsetHalves) { // Two sparse iterators: one for even bits, one for odd ones vector even, odd; - for (u32 i = 0; i < test_size; i += 2) { + for (u64a i = 0; i < test_size; i += 2) { even.push_back(i); } - for (u32 i = 1; i < test_size; i += 2) { + for (u64a i = 1; i < test_size; i += 2) { odd.push_back(i); } @@ -1277,9 +1284,9 @@ static const MultiBitTestParam multibitTests[] = { { 1U << 28, 15073 }, { 1U << 29, 24413 }, { 1U << 30, 50377 }, + { 1U << 31, 104729 }, - // XXX: cases this large segfault in mmbit_set, FIXME NOW - //{ 1U << 31, 3701 }, + // { UINT32_MAX, 104729 }, // Very slow }; INSTANTIATE_TEST_CASE_P(MultiBit, MultiBitTest, ValuesIn(multibitTests)); From b87590ce9d01d4d09fc1fd1198aea1ff04225137 Mon Sep 17 00:00:00 2001 From: Justin 
Viiret Date: Wed, 2 Dec 2015 09:38:20 +1100 Subject: [PATCH 43/54] castle: simplify find_next_top Tops are no longer sparse in CastleProto, so the linear scan for holes isn't necessary. --- src/nfa/castlecompile.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index e7e40540..b18eb9ad 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -721,10 +721,8 @@ const CharReach &CastleProto::reach() const { static u32 find_next_top(const map &repeats) { - u32 top = 0; - for (; contains(repeats, top); top++) { - // pass - } + u32 top = verify_u32(repeats.size()); + assert(!contains(repeats, top)); return top; } @@ -734,7 +732,7 @@ u32 CastleProto::add(const PureRepeat &pr) { assert(pr.reports.size() == 1); u32 top = find_next_top(repeats); DEBUG_PRINTF("selected unused top %u\n", top); - repeats.insert(make_pair(top, pr)); + repeats.emplace(top, pr); return top; } @@ -783,7 +781,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2, const PureRepeat &pr = m.second; DEBUG_PRINTF("top %u\n", top); u32 new_top = find_next_top(c1.repeats); - c1.repeats.insert(make_pair(new_top, pr)); + c1.repeats.emplace(new_top, pr); top_map[top] = new_top; DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top); } @@ -800,7 +798,7 @@ void remapCastleTops(CastleProto &proto, map &top_map) { const u32 top = m.first; const PureRepeat &pr = m.second; u32 new_top = find_next_top(out); - out.insert(make_pair(new_top, pr)); + out.emplace(new_top, pr); top_map[top] = new_top; } From 1267922ca77182bb89ef1704f9d451eadd302239 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 09:47:59 +1100 Subject: [PATCH 44/54] role aliasing: simplify hashRightRoleProperties Using the full report set for a suffix as an input to this hash was very slow at scale. 
--- src/rose/rose_build_role_aliasing.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 87b6936c..62db5b2e 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -439,12 +439,16 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) { hash_combine(val, hash_range(begin(props.reports), end(props.reports))); if (props.suffix) { - hash_combine(val, all_reports(props.suffix)); - if (props.suffix.graph) { - hash_combine(val, num_vertices(*props.suffix.graph)); + const auto &suffix = props.suffix; + if (suffix.castle) { + hash_combine(val, suffix.castle->reach()); + hash_combine(val, suffix.castle->repeats.size()); } - if (props.suffix.haig) { - hash_combine(val, hash_dfa(*props.suffix.haig)); + if (suffix.graph) { + hash_combine(val, num_vertices(*suffix.graph)); + } + if (suffix.haig) { + hash_combine(val, hash_dfa(*suffix.haig)); } } From 03953f34b13c38f5597c800c206998a3f8a583bc Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 09:54:55 +1100 Subject: [PATCH 45/54] RoseDedupeAuxImpl: collect unique suffixes first --- src/rose/rose_build_misc.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index f627017c..1d58c241 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -601,6 +601,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) : tbi(tbi_in) { const RoseGraph &g = tbi.g; + set suffixes; + for (auto v : vertices_range(g)) { // Literals in the small block table don't count as dupes: although // they have copies in the anchored table, the two are never run in the @@ -611,10 +613,16 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) } } + // Several vertices may share a suffix, so we collect the set of + // suffixes first to 
avoid repeating work. if (g[v].suffix) { - for (const auto &report_id : all_reports(g[v].suffix)) { - suffix_map[report_id].insert(g[v].suffix); - } + suffixes.insert(g[v].suffix); + } + } + + for (const auto &suffix : suffixes) { + for (const auto &report_id : all_reports(suffix)) { + suffix_map[report_id].insert(suffix); } } From 8dac64d1dcad73a3bd3270215a9fc22e15ad49c8 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 10:24:54 +1100 Subject: [PATCH 46/54] findMinWidth, findMaxWidth: width for a given top Currently only implemented for Castle suffixes. --- src/nfa/castlecompile.cpp | 16 ++++++++++++++++ src/nfa/castlecompile.h | 2 ++ src/rose/rose_build_impl.h | 4 ++++ src/rose/rose_build_misc.cpp | 24 ++++++++++++++++++++++++ src/rose/rose_build_width.cpp | 12 +++++++----- 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index b18eb9ad..7d37ef81 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -708,6 +708,22 @@ depth findMaxWidth(const CastleProto &proto) { return max_width; } +depth findMinWidth(const CastleProto &proto, u32 top) { + if (!contains(proto.repeats, top)) { + assert(0); // should not happen + return depth::infinity(); + } + return proto.repeats.at(top).bounds.min; +} + +depth findMaxWidth(const CastleProto &proto, u32 top) { + if (!contains(proto.repeats, top)) { + assert(0); // should not happen + return depth(0); + } + return proto.repeats.at(top).bounds.max; +} + CastleProto::CastleProto(const PureRepeat &pr) { assert(pr.reach.any()); assert(pr.reports.size() == 1); diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index fbafb606..a35f229d 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -85,6 +85,8 @@ struct CastleProto { std::set all_reports(const CastleProto &proto); depth findMinWidth(const CastleProto &proto); depth findMaxWidth(const CastleProto &proto); +depth findMinWidth(const CastleProto 
&proto, u32 top); +depth findMaxWidth(const CastleProto &proto, u32 top); /** * \brief Remap tops to be contiguous. diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 26a7c606..3112d639 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -130,6 +130,8 @@ private: friend depth findMinWidth(const suffix_id &s); friend depth findMaxWidth(const suffix_id &s); + friend depth findMinWidth(const suffix_id &s, u32 top); + friend depth findMaxWidth(const suffix_id &s, u32 top); }; std::set all_reports(const suffix_id &s); @@ -138,6 +140,8 @@ bool has_eod_accepts(const suffix_id &s); bool has_non_eod_accepts(const suffix_id &s); depth findMinWidth(const suffix_id &s); depth findMaxWidth(const suffix_id &s); +depth findMinWidth(const suffix_id &s, u32 top); +depth findMaxWidth(const suffix_id &s, u32 top); size_t hash_value(const suffix_id &s); /** \brief represents an engine to the left of a rose role */ diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 1d58c241..b8775912 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -907,6 +907,18 @@ depth findMinWidth(const suffix_id &s) { } } +depth findMinWidth(const suffix_id &s, u32 top) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + // TODO: take top into account for non-castle suffixes. + if (s.graph()) { + return findMinWidth(*s.graph()); + } else if (s.castle()) { + return findMinWidth(*s.castle(), top); + } else { + return s.dfa_min_width; + } +} + depth findMaxWidth(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { @@ -918,6 +930,18 @@ depth findMaxWidth(const suffix_id &s) { } } +depth findMaxWidth(const suffix_id &s, u32 top) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + // TODO: take top into account for non-castle suffixes. 
+ if (s.graph()) { + return findMaxWidth(*s.graph()); + } else if (s.castle()) { + return findMaxWidth(*s.castle(), top); + } else { + return s.dfa_max_width; + } +} + bool has_eod_accepts(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { diff --git a/src/rose/rose_build_width.cpp b/src/rose/rose_build_width.cpp index 2a7f2bd6..6bfcee48 100644 --- a/src/rose/rose_build_width.cpp +++ b/src/rose/rose_build_width.cpp @@ -94,10 +94,11 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { } if (g[v].suffix) { - depth suffix_width = findMinWidth(g[v].suffix); + depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top); assert(suffix_width.is_reachable()); - DEBUG_PRINTF("%zu has suffix (width %s), can fire report at %u\n", - g[v].idx, suffix_width.str().c_str(), + DEBUG_PRINTF("%zu has suffix with top %u (width %s), can fire " + "report at %u\n", + g[v].idx, g[v].suffix.top, suffix_width.str().c_str(), w + suffix_width); minWidth = min(minWidth, w + suffix_width); } @@ -146,8 +147,9 @@ u32 findMaxBAWidth(const RoseBuildImpl &tbi) { if (has_non_eod_accepts(g[v].suffix)) { return ROSE_BOUND_INF; } - depth suffix_width = findMaxWidth(g[v].suffix); - DEBUG_PRINTF("suffix max width %s\n", suffix_width.str().c_str()); + depth suffix_width = findMaxWidth(g[v].suffix, g[v].suffix.top); + DEBUG_PRINTF("suffix max width for top %u is %s\n", g[v].suffix.top, + suffix_width.str().c_str()); assert(suffix_width.is_reachable()); if (!suffix_width.is_finite()) { DEBUG_PRINTF("suffix too wide\n"); From da23e8306afda2d5d4706296da177ce9089987a8 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 10:39:32 +1100 Subject: [PATCH 47/54] assignDkeys: use flat_set, not set --- src/nfa/castlecompile.cpp | 3 ++- src/nfa/castlecompile.h | 4 +++- src/rose/rose_build.h | 5 +++-- src/rose/rose_build_misc.cpp | 17 +++++++++-------- src/util/report_manager.cpp | 6 +++--- 5 files changed, 20 insertions(+), 15 
deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 7d37ef81..32dc8f24 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -894,7 +894,8 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) { return c1.repeats == c2.repeats; } -bool requiresDedupe(const CastleProto &proto, const set &reports) { +bool requiresDedupe(const CastleProto &proto, + const ue2::flat_set &reports) { ue2::unordered_set seen; for (const PureRepeat &pr : proto.repeats | map_values) { for (const ReportID &report : pr.reports) { diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index a35f229d..bb5eff59 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -38,6 +38,7 @@ #include "nfagraph/ng_repeat.h" #include "util/alloc.h" #include "util/depth.h" +#include "util/ue2_containers.h" #include #include @@ -135,7 +136,8 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2); * \brief True if the given castle contains more than a single instance of any * of the reports in the given set. */ -bool requiresDedupe(const CastleProto &proto, const std::set &reports); +bool requiresDedupe(const CastleProto &proto, + const ue2::flat_set &reports); /** * \brief Build an NGHolder from a CastleProto. diff --git a/src/rose/rose_build.h b/src/rose/rose_build.h index 1a1fc223..bef2114f 100644 --- a/src/rose/rose_build.h +++ b/src/rose/rose_build.h @@ -42,6 +42,7 @@ #include "rose_in_graph.h" #include "util/alloc.h" #include "util/charreach.h" +#include "util/ue2_containers.h" #include "util/ue2string.h" #include @@ -72,8 +73,8 @@ public: /** \brief True if we can not establish that at most a single callback will * be generated at a given offset from this set of reports. 
*/ - virtual bool requiresDedupeSupport(const std::set &reports) const - = 0; + virtual bool requiresDedupeSupport(const ue2::flat_set &reports) + const = 0; }; /** \brief Abstract interface intended for callers from elsewhere in the tree, diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index b8775912..e54c0370 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -538,7 +538,7 @@ u32 RoseBuildImpl::getNewLiteralId() { } static -bool requiresDedupe(const NGHolder &h, const set &reports, +bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, const Grey &grey) { /* TODO: tighten */ NFAVertex seen_vert = NFAGraph::null_vertex(); @@ -581,13 +581,14 @@ bool requiresDedupe(const NGHolder &h, const set &reports, class RoseDedupeAuxImpl : public RoseDedupeAux { public: explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in); - bool requiresDedupeSupport(const set &reports) const override; + bool requiresDedupeSupport( + const ue2::flat_set &reports) const override; const RoseBuildImpl &tbi; - map > vert_map; - map > suffix_map; - map > outfix_map; - map > puff_map; + map> vert_map; + map> suffix_map; + map> outfix_map; + map> puff_map; }; unique_ptr RoseBuildImpl::generateDedupeAux() const { @@ -644,8 +645,8 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) } } -bool RoseDedupeAuxImpl::requiresDedupeSupport(const set &reports) - const { +bool RoseDedupeAuxImpl::requiresDedupeSupport( + const ue2::flat_set &reports) const { /* TODO: this could be expanded to check for offset or character constraints */ diff --git a/src/util/report_manager.cpp b/src/util/report_manager.cpp index eacef005..425f166d 100644 --- a/src/util/report_manager.cpp +++ b/src/util/report_manager.cpp @@ -128,11 +128,9 @@ vector ReportManager::getDkeyToReportTable() const { } void ReportManager::assignDkeys(const RoseBuild *rose) { - unique_ptr dedupe = rose->generateDedupeAux(); - DEBUG_PRINTF("assigning...\n"); - map> 
ext_to_int; + map> ext_to_int; for (u32 i = 0; i < reportIds.size(); i++) { const Report &ir = reportIds[i]; @@ -143,6 +141,8 @@ void ReportManager::assignDkeys(const RoseBuild *rose) { } } + auto dedupe = rose->generateDedupeAux(); + for (const auto &m : ext_to_int) { u32 ext = m.first; From 8427d837809ed0afcdc8db8d54d79d9b0b4c47b4 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 11:31:09 +1100 Subject: [PATCH 48/54] CastleProto: track mapping of reports to tops This allows us to speed up report-based queries, like dedupe checking. --- src/nfa/castlecompile.cpp | 48 ++++++++++++++++++++++++++------------- src/nfa/castlecompile.h | 3 +++ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 32dc8f24..8d7f3913 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -58,6 +58,7 @@ #include using namespace std; +using boost::adaptors::map_keys; using boost::adaptors::map_values; namespace ue2 { @@ -686,8 +687,8 @@ buildCastle(const CastleProto &proto, set all_reports(const CastleProto &proto) { set reports; - for (const PureRepeat &pr : proto.repeats | map_values) { - reports.insert(pr.reports.begin(), pr.reports.end()); + for (const ReportID &report : proto.report_map | map_keys) { + reports.insert(report); } return reports; } @@ -727,7 +728,11 @@ depth findMaxWidth(const CastleProto &proto, u32 top) { CastleProto::CastleProto(const PureRepeat &pr) { assert(pr.reach.any()); assert(pr.reports.size() == 1); - repeats.insert(make_pair(0, pr)); + u32 top = 0; + repeats.emplace(top, pr); + for (const auto &report : pr.reports) { + report_map[report].insert(top); + } } const CharReach &CastleProto::reach() const { @@ -749,6 +754,9 @@ u32 CastleProto::add(const PureRepeat &pr) { u32 top = find_next_top(repeats); DEBUG_PRINTF("selected unused top %u\n", top); repeats.emplace(top, pr); + for (const auto &report : pr.reports) { + report_map[report].insert(top); + } return 
top; } @@ -796,8 +804,7 @@ bool mergeCastle(CastleProto &c1, const CastleProto &c2, const u32 top = m.first; const PureRepeat &pr = m.second; DEBUG_PRINTF("top %u\n", top); - u32 new_top = find_next_top(c1.repeats); - c1.repeats.emplace(new_top, pr); + u32 new_top = c1.add(pr); top_map[top] = new_top; DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top); } @@ -819,6 +826,17 @@ void remapCastleTops(CastleProto &proto, map &top_map) { } proto.repeats.swap(out); + + // Remap report map. + proto.report_map.clear(); + for (const auto &m : proto.repeats) { + const u32 top = m.first; + const PureRepeat &pr = m.second; + for (const auto &report : pr.reports) { + proto.report_map[report].insert(top); + } + } + assert(proto.repeats.size() <= proto.max_occupancy); } @@ -896,17 +914,15 @@ bool is_equal(const CastleProto &c1, const CastleProto &c2) { bool requiresDedupe(const CastleProto &proto, const ue2::flat_set &reports) { - ue2::unordered_set seen; - for (const PureRepeat &pr : proto.repeats | map_values) { - for (const ReportID &report : pr.reports) { - if (contains(reports, report)) { - if (contains(seen, report)) { - DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto, - report); - return true; - } - seen.insert(report); - } + for (const auto &report : reports) { + auto it = proto.report_map.find(report); + if (it == end(proto.report_map)) { + continue; + } + if (it->second.size() > 1) { + DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto, + report); + return true; } } return false; diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index bb5eff59..8ca3581c 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -81,6 +81,9 @@ struct CastleProto { /** \brief Mapping from unique top id to repeat. */ std::map repeats; + + /** \brief Mapping from report to associated tops. 
*/ + ue2::unordered_map> report_map; }; std::set all_reports(const CastleProto &proto); From 748d46c12420d7cb3b7b2805283fd52abdbe158d Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 3 Dec 2015 09:27:57 +1100 Subject: [PATCH 49/54] CastleProto: track next top explicitly Repeats may be removed (e.g. by pruning in role aliasing passes) leaving "holes" in the top map. Track the next top to use explicitly, rather than using repeats.size(). --- src/nfa/castlecompile.cpp | 21 ++++++++++++--------- src/nfa/castlecompile.h | 10 ++++++++++ src/rose/rose_build_role_aliasing.cpp | 17 ++++++++++------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 8d7f3913..cc5c599b 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -740,19 +740,13 @@ const CharReach &CastleProto::reach() const { return repeats.begin()->second.reach; } -static -u32 find_next_top(const map &repeats) { - u32 top = verify_u32(repeats.size()); - assert(!contains(repeats, top)); - return top; -} - u32 CastleProto::add(const PureRepeat &pr) { assert(repeats.size() < max_occupancy); assert(pr.reach == reach()); assert(pr.reports.size() == 1); - u32 top = find_next_top(repeats); + u32 top = next_top++; DEBUG_PRINTF("selected unused top %u\n", top); + assert(!contains(repeats, top)); repeats.emplace(top, pr); for (const auto &report : pr.reports) { report_map[report].insert(top); @@ -760,6 +754,15 @@ u32 CastleProto::add(const PureRepeat &pr) { return top; } +void CastleProto::erase(u32 top) { + DEBUG_PRINTF("erase top %u\n", top); + assert(contains(repeats, top)); + repeats.erase(top); + for (auto &m : report_map) { + m.second.erase(top); + } +} + u32 CastleProto::merge(const PureRepeat &pr) { assert(repeats.size() <= max_occupancy); assert(pr.reach == reach()); @@ -820,7 +823,7 @@ void remapCastleTops(CastleProto &proto, map &top_map) { for (const auto &m : proto.repeats) { const u32 top = m.first; const 
PureRepeat &pr = m.second; - u32 new_top = find_next_top(out); + u32 new_top = out.size(); out.emplace(new_top, pr); top_map[top] = new_top; } diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index 8ca3581c..fc4bb991 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -68,8 +68,12 @@ struct CastleProto { explicit CastleProto(const PureRepeat &pr); const CharReach &reach() const; + /** \brief Add a new repeat. */ u32 add(const PureRepeat &pr); + /** \brief Remove a repeat. */ + void erase(u32 top); + /** * \brief Merge in the given repeat, returning the top used. * @@ -84,6 +88,12 @@ struct CastleProto { /** \brief Mapping from report to associated tops. */ ue2::unordered_map> report_map; + + /** + * \brief Next top id to use. Repeats may be removed without top remapping, + * so we track this explicitly instead of using repeats.size(). + */ + u32 next_top = 1; }; std::set all_reports(const CastleProto &proto); diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 62db5b2e..88deaa25 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -751,14 +751,17 @@ void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr h, * Castle. */ static void pruneCastle(CastleProto &castle, ReportID report) { - for (map::iterator it = castle.repeats.begin(); - it != castle.repeats.end(); /* incr inside */) { - if (contains(it->second.reports, report)) { - ++it; - } else { - castle.repeats.erase(it++); + unordered_set dead; // tops to remove. 
+ for (const auto &m : castle.repeats) { + if (!contains(m.second.reports, report)) { + dead.insert(m.first); } } + + for (const auto &top : dead) { + castle.erase(top); + } + assert(!castle.repeats.empty()); } @@ -798,7 +801,7 @@ void pruneUnusedTops(CastleProto &castle, const RoseGraph &g, for (u32 top : assoc_keys(castle.repeats)) { if (!contains(used_tops, top)) { DEBUG_PRINTF("removing unused top %u\n", top); - castle.repeats.erase(top); + castle.erase(top); } } } From 8c09d054c9035966d45a1b09504e090f4b2924aa Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Dec 2015 18:16:49 +1100 Subject: [PATCH 50/54] Add per-top findMinWidth etc for NFA graphs --- src/nfagraph/ng_width.cpp | 68 ++++++++++++++++++++++++++---------- src/nfagraph/ng_width.h | 28 ++++++++++++--- src/rose/rose_build_misc.cpp | 6 ++-- 3 files changed, 75 insertions(+), 27 deletions(-) diff --git a/src/nfagraph/ng_width.cpp b/src/nfagraph/ng_width.cpp index 3ecb391c..470f9343 100644 --- a/src/nfagraph/ng_width.cpp +++ b/src/nfagraph/ng_width.cpp @@ -51,10 +51,16 @@ namespace ue2 { namespace { -/** Filter out edges from start-to-start or accept-to-accept. */ +/** + * Filter out special edges, or in the top-specific variant, start edges that + * don't have the right top set. 
+ */ struct SpecialEdgeFilter { SpecialEdgeFilter() {} - explicit SpecialEdgeFilter(const NGHolder *h_in) : h(h_in) {} + explicit SpecialEdgeFilter(const NGHolder &h_in) : h(&h_in) {} + explicit SpecialEdgeFilter(const NGHolder &h_in, u32 top_in) + : h(&h_in), single_top(true), top(top_in) {} + bool operator()(const NFAEdge &e) const { const NFAGraph &g = h->g; NFAVertex u = source(e, g), v = target(e, g); @@ -62,23 +68,33 @@ struct SpecialEdgeFilter { (is_any_accept(u, g) && is_any_accept(v, g))) { return false; } + if (single_top) { + if (u == h->start && g[e].top != top) { + return false; + } + if (u == h->startDs) { + return false; + } + } return true; } private: const NGHolder *h = nullptr; + bool single_top = false; + u32 top = 0; }; } // namespace static -depth findMinWidth(const NGHolder &h, NFAVertex src) { +depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter, + NFAVertex src) { if (isLeafNode(src, h)) { return depth::unreachable(); } - typedef boost::filtered_graph StartGraph; - StartGraph g(h.g, SpecialEdgeFilter(&h)); + boost::filtered_graph g(h.g, filter); assert(hasCorrectlyNumberedVertices(h)); const size_t num = num_vertices(h); @@ -112,7 +128,8 @@ depth findMinWidth(const NGHolder &h, NFAVertex src) { } static -depth findMaxWidth(const NGHolder &h, NFAVertex src) { +depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, + NFAVertex src) { if (isLeafNode(src, h.g)) { return depth::unreachable(); } @@ -122,8 +139,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) { return depth::infinity(); } - typedef boost::filtered_graph NodeFilteredGraph; - NodeFilteredGraph g(h.g, SpecialEdgeFilter(&h)); + boost::filtered_graph g(h.g, filter); assert(hasCorrectlyNumberedVertices(h)); const size_t num = num_vertices(h); @@ -164,7 +180,7 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) { if (d.is_unreachable()) { // If we're actually reachable, we'll have a min width, so we can // return infinity in this case. 
- if (findMinWidth(h, src).is_reachable()) { + if (findMinWidth(h, filter, src).is_reachable()) { return depth::infinity(); } return d; @@ -175,11 +191,10 @@ depth findMaxWidth(const NGHolder &h, NFAVertex src) { return d - depth(1); } -/** Returns the minimum width in bytes of an input that will match the given - * graph. */ -depth findMinWidth(const NGHolder &h) { - depth startDepth = findMinWidth(h, h.start); - depth dotstarDepth = findMinWidth(h, h.startDs); +static +depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter) { + depth startDepth = findMinWidth(h, filter, h.start); + depth dotstarDepth = findMinWidth(h, filter, h.startDs); DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(), dotstarDepth.str().c_str()); if (startDepth.is_unreachable()) { @@ -194,11 +209,18 @@ depth findMinWidth(const NGHolder &h) { } } -/** Returns the maximum width in bytes of an input that will match the given - * graph. If there is no maximum width, returns infinity. 
*/ -depth findMaxWidth(const NGHolder &h) { - depth startDepth = findMaxWidth(h, h.start); - depth dotstarDepth = findMaxWidth(h, h.startDs); +depth findMinWidth(const NGHolder &h) { + return findMinWidth(h, SpecialEdgeFilter(h)); +} + +depth findMinWidth(const NGHolder &h, u32 top) { + return findMinWidth(h, SpecialEdgeFilter(h, top)); +} + +static +depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter) { + depth startDepth = findMaxWidth(h, filter, h.start); + depth dotstarDepth = findMaxWidth(h, filter, h.startDs); DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(), dotstarDepth.str().c_str()); if (startDepth.is_unreachable()) { @@ -210,4 +232,12 @@ depth findMaxWidth(const NGHolder &h) { } } +depth findMaxWidth(const NGHolder &h) { + return findMaxWidth(h, SpecialEdgeFilter(h)); +} + +depth findMaxWidth(const NGHolder &h, u32 top) { + return findMaxWidth(h, SpecialEdgeFilter(h, top)); +} + } // namespace ue2 diff --git a/src/nfagraph/ng_width.h b/src/nfagraph/ng_width.h index 5c78409b..871e8a93 100644 --- a/src/nfagraph/ng_width.h +++ b/src/nfagraph/ng_width.h @@ -41,14 +41,34 @@ namespace ue2 { class NGHolder; -/** Returns the minimum width in bytes of an input that will match the given - * graph. */ +/** + * \brief Compute the minimum width in bytes of an input that will match the + * given graph. + */ depth findMinWidth(const NGHolder &h); -/** Returns the maximum width in bytes of an input that will match the given - * graph. If there is no maximum width, returns infinity. */ +/** + * \brief Compute the minimum width in bytes of an input that will match the + * given graph, considering only paths activated by the given top. + */ +depth findMinWidth(const NGHolder &h, u32 top); + +/** + * \brief Compute the maximum width in bytes of an input that will match the + * given graph. + * + * If there is no bound on the maximum width, returns infinity. 
+ */ depth findMaxWidth(const NGHolder &h); +/** + * \brief Compute the maximum width in bytes of an input that will match the + * given graph, considering only paths activated by the given top. + * + * If there is no bound on the maximum width, returns infinity. + */ +depth findMaxWidth(const NGHolder &h, u32 top); + } // namespace ue2 #endif // NG_WIDTH_H diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index e54c0370..cc5bbc70 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -910,9 +910,8 @@ depth findMinWidth(const suffix_id &s) { depth findMinWidth(const suffix_id &s, u32 top) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); - // TODO: take top into account for non-castle suffixes. if (s.graph()) { - return findMinWidth(*s.graph()); + return findMinWidth(*s.graph(), top); } else if (s.castle()) { return findMinWidth(*s.castle(), top); } else { @@ -933,9 +932,8 @@ depth findMaxWidth(const suffix_id &s) { depth findMaxWidth(const suffix_id &s, u32 top) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); - // TODO: take top into account for non-castle suffixes. 
if (s.graph()) { - return findMaxWidth(*s.graph()); + return findMaxWidth(*s.graph(), top); } else if (s.castle()) { return findMaxWidth(*s.castle(), top); } else { From 7bcd2b07c92a56ebc7dd537ccfc361d2e0fa824e Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Wed, 2 Dec 2015 07:24:57 -0500 Subject: [PATCH 51/54] simplify max clique analysis --- src/nfa/castlecompile.cpp | 228 +++++++++----------------------------- 1 file changed, 53 insertions(+), 175 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index cc5c599b..e5cc9267 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -64,6 +64,7 @@ using boost::adaptors::map_values; namespace ue2 { #define CASTLE_MAX_TOPS 32 +#define CLIQUE_GRAPH_MAX_SIZE 1000 static u32 depth_to_u32(const depth &d) { @@ -107,209 +108,90 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) { } static -size_t literalOverlap(const vector &a, const vector &b) { +bool literalOverlap(const vector &a, const vector &b, + const size_t dist) { for (size_t i = 0; i < b.size(); i++) { + if (i > dist) { + return true; + } size_t overlap_len = b.size() - i; if (overlap_len <= a.size()) { if (matches(a.end() - overlap_len, a.end(), b.begin(), b.end() - i)) { - return i; + return false; } } else { assert(overlap_len > a.size()); if (matches(a.begin(), a.end(), b.end() - i - a.size(), b.end() - i)) { - return i; + return false; } } } - return b.size(); + return b.size() > dist; } -// UE-2666 case 1: The problem of find largest exclusive subcastles group -// can be reformulated as finding the largest clique (subgraph where every -// vertex is connected to every other vertex) in the graph. We use an -// approximate algorithm here to find the maximum clique. -// References -// ---------- -// [1] Boppana, R., & HalldΓ³rsson, M. M. (1992). -// Approximating maximum independent sets by excluding subgraphs. -// BIT Numerical Mathematics, 32(2), 180–196. Springer. 
-// doi:10.1007/BF01994876 -// ---------- - struct CliqueVertexProps { CliqueVertexProps() {} explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {} u32 stateId = ~0U; - u32 parentId = ~0U; - bool leftChild = false; /* tells us if it is the left child of its parent */ - - vector clique1; /* clique for the left branch */ - vector indepSet1; /* independent set for the left branch */ - vector clique2; /* clique for the right branch */ - vector indepSet2; /* independent set for the right branch */ }; typedef boost::adjacency_list CliqueGraph; typedef CliqueGraph::vertex_descriptor CliqueVertex; -static -unique_ptr makeCG(const vector> &exclusiveSet) { - u32 size = exclusiveSet.size(); - - vector vertices; - unique_ptr cg = make_unique(); - for (u32 i = 0; i < size; ++i) { - CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); - vertices.push_back(v); - } - - // construct the complement graph, then its maximum independent sets - // are equal to the maximum clique of the original graph - for (u32 i = 0; i < size; ++i) { - CliqueVertex s = vertices[i]; - vector complement(size, 0); - for (u32 j = 0; j < exclusiveSet[i].size(); ++j) { - u32 val = exclusiveSet[i][j]; - complement[val] = 1; - } - - for (u32 k = i + 1; k < size; ++k) { - if (!complement[k]) { - CliqueVertex d = vertices[k]; - add_edge(s, d, *cg); - } - } - } - return cg; -} - -static -void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n, - vector &clique, vector &indepSet) { - u32 id = cg[n].stateId; - if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) { - cg[n].clique1.push_back(id); - clique.swap(cg[n].clique1); - } else { - clique.swap(cg[n].clique2); - } - - if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) { - cg[n].indepSet2.push_back(id); - indepSet.swap(cg[n].indepSet2); - } else { - indepSet.swap(cg[n].indepSet1); - } -} - static void getNeighborInfo(const CliqueGraph &g, vector &neighbor, - vector &nonneighbor, const CliqueVertex &cv, - const set &group) { + const 
CliqueVertex &cv, const set &group) { u32 id = g[cv].stateId; ue2::unordered_set neighborId; // find neighbors for cv for (const auto &v : adjacent_vertices_range(cv, g)) { - if (g[v].stateId != id && contains(group, g[v].stateId)) { + if (g[v].stateId != id && contains(group, g[v].stateId)){ neighbor.push_back(g[v].stateId); neighborId.insert(g[v].stateId); - } - } - - neighborId.insert(id); - // find non-neighbors for cv - for (const auto &v : vertices_range(g)) { - if (!contains(neighborId, g[v].stateId) && - contains(group, g[v].stateId)) { - nonneighbor.push_back(g[v].stateId); + DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); } } } static -void findCliqueGroup(CliqueGraph &cg, vector &clique, - vector &indepSet) { +void findCliqueGroup(CliqueGraph &cg, vector &clique) { stack> gStack; - // create mapping between vertex and id + // Create mapping between vertex and id map vertexMap; vector init; - for (auto &v : vertices_range(cg)) { + for (const auto &v : vertices_range(cg)) { vertexMap[cg[v].stateId] = v; init.push_back(cg[v].stateId); } gStack.push(init); - // get the vertex to start from - set foundVertexId; - ue2::unordered_set visitedId; + // Get the vertex to start from CliqueGraph::vertex_iterator vi, ve; tie(vi, ve) = vertices(cg); - CliqueVertex start = *vi; - u32 startId = cg[start].stateId; - DEBUG_PRINTF("startId:%u\n", startId); - bool leftChild = false; - u32 prevId = startId; while (!gStack.empty()) { - const auto &g = gStack.top(); + vector g = gStack.top(); + gStack.pop(); - // choose a vertex from the graph - assert(!g.empty()); + // Choose a vertex from the graph u32 id = g[0]; - CliqueVertex &n = vertexMap.at(id); - + const CliqueVertex &n = vertexMap.at(id); + clique.push_back(id); + // Corresponding vertex in the original graph vector neighbor; - vector nonneighbor; set subgraphId(g.begin(), g.end()); - getNeighborInfo(cg, neighbor, nonneighbor, n, subgraphId); - if (contains(foundVertexId, id)) { - prevId = id; - // get non-neighbors for 
right branch - if (visitedId.insert(id).second) { - DEBUG_PRINTF("right branch\n"); - if (!nonneighbor.empty()) { - gStack.push(nonneighbor); - leftChild = false; - } - } else { - if (id != startId) { - // both the left and right branches are visited, - // update its parent's clique and independent sets - u32 parentId = cg[n].parentId; - CliqueVertex &parent = vertexMap.at(parentId); - if (cg[n].leftChild) { - updateCliqueInfo(cg, n, cg[parent].clique1, - cg[parent].indepSet1); - } else { - updateCliqueInfo(cg, n, cg[parent].clique2, - cg[parent].indepSet2); - } - } - gStack.pop(); - } - } else { - foundVertexId.insert(id); - cg[n].leftChild = leftChild; - cg[n].parentId = prevId; - cg[n].clique1.clear(); - cg[n].clique2.clear(); - cg[n].indepSet1.clear(); - cg[n].indepSet2.clear(); - // get neighbors for left branch - if (!neighbor.empty()) { - gStack.push(neighbor); - leftChild = true; - } - prevId = id; + getNeighborInfo(cg, neighbor, n, subgraphId); + // Get graph consisting of neighbors for left branch + if (!neighbor.empty()) { + gStack.push(neighbor); } } - updateCliqueInfo(cg, start, clique, indepSet); } template @@ -322,9 +204,8 @@ bool graph_empty(const Graph &g) { static vector removeClique(CliqueGraph &cg) { vector> cliquesVec(1); - vector> indepSetsVec(1); DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); - findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]); + findCliqueGroup(cg, cliquesVec[0]); while (!graph_empty(cg)) { const vector &c = cliquesVec.back(); vector dead; @@ -341,30 +222,22 @@ vector removeClique(CliqueGraph &cg) { break; } vector clique; - vector indepSet; - findCliqueGroup(cg, clique, indepSet); + findCliqueGroup(cg, clique); cliquesVec.push_back(clique); - indepSetsVec.push_back(indepSet); } // get the independent set with max size size_t max = 0; size_t id = 0; - for (size_t j = 0; j < indepSetsVec.size(); ++j) { - if (indepSetsVec[j].size() > max) { - max = indepSetsVec[j].size(); + for (size_t j = 0; j < cliquesVec.size(); ++j) 
{ + if (cliquesVec[j].size() > max) { + max = cliquesVec[j].size(); id = j; } } - DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size()); - return indepSetsVec[id]; -} - -static -vector findMaxClique(const vector> &exclusiveSet) { - auto cg = makeCG(exclusiveSet); - return removeClique(*cg); + DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size()); + return cliquesVec[id]; } // if the location of any reset character in one literal are after @@ -378,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2, const auto &triggers2 = triggers[id2]; for (u32 i = 0; i < triggers1.size(); ++i) { for (u32 j = 0; j < triggers2.size(); ++j) { - size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]); - size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]); - if (max_overlap1 <= min_reset_dist[id2][j] || - max_overlap2 <= min_reset_dist[id1][i]) { + if (!literalOverlap(triggers1[i], triggers2[j], + min_reset_dist[id2][j]) || + !literalOverlap(triggers2[j], triggers1[i], + min_reset_dist[id1][i])) { return false; } } @@ -397,28 +270,33 @@ vector checkExclusion(const CharReach &cr, return group; } - vector > min_reset_dist; + vector> min_reset_dist; // get min reset distance for each repeat for (auto it = triggers.begin(); it != triggers.end(); it++) { const vector &tmp_dist = minResetDistToEnd(*it, cr); min_reset_dist.push_back(tmp_dist); } - vector> exclusiveSet; + vector vertices; + unique_ptr cg = make_unique(); + for (u32 i = 0; i < triggers.size(); ++i) { + CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); + vertices.push_back(v); + } + // find exclusive pair for each repeat for (u32 i = 0; i < triggers.size(); ++i) { - vector repeatIds; + CliqueVertex s = vertices[i]; for (u32 j = i + 1; j < triggers.size(); ++j) { if (findExclusivePair(i, j, min_reset_dist, triggers)) { - repeatIds.push_back(j); + CliqueVertex d = vertices[j]; + add_edge(s, d, *cg); } } - exclusiveSet.push_back(repeatIds); - DEBUG_PRINTF("Exclusive pair 
size:%lu\n", repeatIds.size()); } // find the largest exclusive group - return findMaxClique(exclusiveSet); + return removeClique(*cg); } static @@ -576,7 +454,7 @@ buildCastle(const CastleProto &proto, repeatInfoPair.push_back(make_pair(min_period, is_reset)); - if (is_reset) { + if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) { candidateTriggers.push_back(triggers.at(top)); candidateRepeats.push_back(i); } @@ -585,7 +463,7 @@ buildCastle(const CastleProto &proto, // Case 1: exclusive repeats bool exclusive = false; bool pureExclusive = false; - u8 activeIdxSize = 0; + u32 activeIdxSize = 0; set exclusiveGroup; if (cc.grey.castleExclusive) { vector tmpGroup = checkExclusion(cr, candidateTriggers); @@ -594,7 +472,7 @@ buildCastle(const CastleProto &proto, // Case 1: mutual exclusive repeats group found, initialize state // sizes exclusive = true; - activeIdxSize = calcPackedBytes(exclusiveSize); + activeIdxSize = calcPackedBytes(numRepeats + 1); if (exclusiveSize == numRepeats) { pureExclusive = true; streamStateSize = 0; @@ -642,7 +520,7 @@ buildCastle(const CastleProto &proto, c->numRepeats = verify_u32(subs.size()); c->exclusive = exclusive; c->pureExclusive = pureExclusive; - c->activeIdxSize = activeIdxSize; + c->activeIdxSize = verify_u8(activeIdxSize); writeCastleScanEngine(cr, c); From 2aa6830c8887e11040bd5c4c7446ac5b62e0eff9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 16 Dec 2015 14:10:46 +1100 Subject: [PATCH 52/54] Add ChangeLog --- CHANGELOG.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..c137017a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,36 @@ +# Hyperscan Change Log + +This is a list of notable changes to Hyperscan, in reverse chronological order. + +## [4.1.0] 2015-12-18 +- Update version of PCRE used by testing tools as a syntax and semantic + reference to PCRE 8.38. 
+- Small updates to fix warnings identified by Coverity. +- Clean up and unify exception handling behaviour across GPR and SIMD NFA + models. +- Fix bug in handling of bounded repeat triggers with large gaps between them + for sparse repeat model. +- Correctly reject POSIX collating elements (`[.ch.]`, `[=ch=]`) in the parser. + These are not supported by Hyperscan. +- Add support for quoted sequences (`\Q...\E`) inside character classes. +- Simplify FDR literal matcher runtime by removing some static specialization. +- Fix handling of the POSIX `[:graph:]`, `[:print:]` and `[:punct:]` character + classes to match the behaviour of PCRE 8.38 in both standard operation and + with the UCP flag set. (Note: some bugs were fixed in this area in PCRE + 8.38.) Previously Hyperscan's behaviour was the same as versions of PCRE + before 8.34. +- Improve performance when compiling pattern sets that include a large number + of similar bounded repeat constructs. (github issue #9) + +## [4.0.1] 2015-10-30 +- Minor cleanups to test code. +- CMake and other build system improvements. +- API update: allow `hs_reset_stream()` and `hs_reset_and_copy_stream()` to be + supplied with a NULL scratch pointer if no matches are required. This is in + line with the behaviour of `hs_close_stream()`. +- Disallow bounded repeats with a very large minimum repeat but no maximum, + i.e. {N,} for very large N. +- Reduce compile memory usage in literal set explansion for some large cases. + +## [4.0.0] 2015-10-20 +- Original release of Hyperscan as open-source software. 
From 0f2cbb9ffd395bd984679015221587868b8d40dd Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 16 Dec 2015 14:31:43 +1100 Subject: [PATCH 53/54] Small updates to documentation for 4.1 --- README.md | 21 +++++++++++++++++++++ doc/dev-reference/compilation.rst | 3 +++ 2 files changed, 24 insertions(+) diff --git a/README.md b/README.md index 37fc38ce..1753ecbe 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,24 @@ the [Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/) Hyperscan is licensed under the BSD License. See the LICENSE file in the project repository. +# Versioning + +The `master` branch on Github will always contain the most recent release of +Hyperscan. Each version released to `master` goes through QA and testing before +it is released; if you're a user, rather than a developer, this is the version +you should be using. + +Further development towards the next release takes place on the `develop` +branch. + +# Get Involved + +The official homepage for Hyperscan is at [01.org/hyperscan](https://01.org/hyperscan). + +If you have questions or comments, we encourage you to [join the mailing +list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by +sending email to the list, or by creating an issue on Github. + +If you wish to contact the Hyperscan team at Intel directly, without posting +publicly to the mailing list, send email to +[hyperscan@intel.com](mailto:hyperscan@intel.com). diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 6e195f6a..f3723dc9 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -63,6 +63,9 @@ described at . However, not all constructs available in libpcre are supported. The use of unsupported constructs will result in compilation errors. +The version of PCRE used to validate Hyperscan's interpretation of this syntax +is 8.38. 
+ ==================== Supported Constructs ==================== From a5944067d40945de886bb2f2706b9038abc9f4dc Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Fri, 18 Dec 2015 14:37:29 +1100 Subject: [PATCH 54/54] Bump version number --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a1dfcaf2..b4d81754 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11) project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 0) -set (HS_PATCH_VERSION 1) +set (HS_MINOR_VERSION 1) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) string (TIMESTAMP BUILD_DATE "%Y-%m-%d")